#hide !pip install nbdev from nbdev import * ! nbdev_upgrade

Introduction

Binder Binder Binder Open Source Love svg3

NPM License Active Python Versions GitHub last commit No Maintenance Intended

GitHub stars GitHub watchers GitHub forks GitHub followers

Tweet Twitter Follow

This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.

Dataset: Scooter data:

# do you see this? - Carlos# Hello world! - Michael# Checking - Brian

Local File Access (Optional)

# hide_output # (Optional) Run this cell to gain access to Google Drive (Colabs only) from google.colab import drive # Colabs operates in a virtualized enviornment # Colabs default directory is at ~/content. # We mount Drive into a temporary folder at '~/content/drive' drive.mount('/content/drive')Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code Enter your authorization code: ·········· Mounted at /content/drive

File paths will vary

# cd ./drive/'My Drive'/BNIA/responsive_records/Routes# cd ../content/drive/My Drive/DATA/scooter/content/drive/My Drive/DATA/scooter # Michael's Directory # cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIAcd ./Routes/content/drive/My Drive/DATA/scooter/Routes # hide_output ! ls leftRouts.csv 'Routing November 2019.geojson' rightRouts.csv 'Routing October 2019.geojson' 'Routing August 2019.geojson' 'Routing September 2019.geojson' 'Routing December 2019.geojson' # hide_output !cd ../ && ls boundsdf.csv rightRouts.csv 'Trip origins-destinations by month' Deployment Routes leftRouts.csv scooterdf.csv

Installs

! pip install dexplot !pip install folium !pip install geopandas !pip install ipyleaflet !pip install gpdvega !pip install dataplay

Imports

import os import pandas as pd import geopandas as gpd import dexplot as dxp import folium as fol import json import altair as alt import gpdvega import dataplay # These imports will handle everything import os import sys import csv from IPython.display import clear_output import matplotlib.pyplot as plt import numpy as np import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame import psycopg2 import pyproj from pyproj import Proj, transform # conda install -c conda-forge proj4 from shapely.geometry import Point from shapely import wkb from shapely.wkt import loads # https://pypi.org/project/geopy/ from geopy.geocoders import Nominatim import folium from folium import plugins from dataplay.merge import mergeDatasets import dexplot as dxp # In case file is KML, enable support import fiona fiona.drvsupport.supported_drivers['kml'] = 'rw' fiona.drvsupport.supported_drivers['KML'] = 'rw' #this cell is good to copy from shapely.geometry import LineString pd.plotting.register_matplotlib_converters() from dataplay.geoms import readInGeometryData from shapely import wkt from dataplay.geoms import readInGeometryData import ipywidgets as widgets !jupyter nbextension enable --py widgetsnbextension from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' import ipywidgets as widgets from ipywidgets import interact, interact_manual alt.data_transformers.enable('default', max_rows=None)

Convenience Functions

# Return the boundary coordinates of a LineString.
def split(geom):
    """Extract the two endpoint coordinates of a LineString geometry.

    Returns (left.x, left.y, right.x, right.y) on success, or
    (False, type(geom), False, type(geom)) when the geometry is not a
    LineString or has an empty boundary.  The failure markers are later
    coerced to NaN downstream via pd.to_numeric(errors='coerce').
    """
    print(str(type(geom)))
    # Only a LineString with a non-empty boundary can be split into two
    # endpoints.  BUG FIX(review): the exported source compared
    # str(type(geom)) against "" (always False, so coordinates were never
    # returned) — the class-name comparison was evidently stripped during
    # export; restored to a LineString type check.  Confirm intent.
    if type(geom).__name__ == "LineString" and str(geom.boundary) != 'MULTIPOINT EMPTY':
        print(geom.boundary)
        left, right = geom.boundary
        print('left x: ', type(left.x), left.x)
        print('left y: ', type(left.y), left.y)
        print('right x: ', type(right.x), right.x)
        print('right y: ', type(right.y), right.y)
        return left.x, left.y, right.x, right.y
    else:
        # Stuff the datatype into the coordinate slots so failures stay visible.
        return False, type(geom), False, type(geom)


# Count, for every polygon, how many points fall inside it.
def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count how many rows of `pts` fall inside each polygon of `polygons`.

    Parameters:
        pts: frame of point rows; matched rows are dropped as they are
            found so later polygons have less hunting.
        polygons: frame of polygon rows.  NOTE: mutated in place — gains
            a 'pointsinpolygon' column.
        ptsCoordCol: column in `pts` holding the point geometry.
        polygonsCoordCol: column in `polygons` holding the polygon geometry.

    Returns the `polygons` frame with the new 'pointsinpolygon' column.
    """
    count = 0
    total = pts.size / len(pts.columns)  # row count of pts
    # We're going to keep a list of how many points we find per polygon.
    pts_in_polys = []
    # Loop over polygons with index i.
    for i, poly in polygons.iterrows():
        # Keep a list of points in this poly.
        pts_in_this_poly = []
        # Now loop over all points with index j.
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # It's a hit!  Add it to the list and drop it so we have
                # less hunting on subsequent polygons.
                pts_in_this_poly.append(pt[ptsCoordCol])
                pts = pts.drop([j])
        # We only need the number of points found in this polygon.
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count += len(pts_in_this_poly)
        clear_output(wait=True)
    # Add the number of points for each poly to the dataframe.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    # ROBUSTNESS: avoid ZeroDivisionError when `pts` is empty.
    print('Prcnt Points in Polygons: ', count / total if total else 0)
    return polygons

File Access Convenience Functions

# Find Relative Path to Files
def findFile(root, file):
    """Walk `root` and return the path of the first directory containing
    `file`, formatted as "<dir>/<file>".  Returns None when the file is
    not found anywhere under `root`."""
    for d, subD, f in os.walk(root):
        if file in f:
            # (removed an unreachable `break` that followed this return)
            return "{1}/{0}".format(file, d)
    return None


# To 'import' a script you wrote, map its filepath into the sys path.
def addPath(root, file):
    """Append the absolute directory of `file` to sys.path so it can be
    imported as a module.

    NOTE(review): `root` is ignored — the lookup is always rooted at
    './'.  Confirm whether that is intended before changing it.
    """
    sys.path.append(os.path.abspath(findFile('./', file)))

Inspect Deployment Data

ls 'Trip origins-destinations by month''Trip Destinations by block August 2019.geojson' 'Trip Destinations by block December 2019.geojson' 'Trip Destinations by block November 2019.geojson' 'Trip Destinations by block October 2019.geojson' 'Trip Destinations by block September 2019.geojson' 'Trip Origins by block August 2019.geojson' 'Trip Origins by block December 2019.geojson' 'Trip Origins by block November 2019.geojson' 'Trip Origins by block October 2019.geojson' 'Trip Origins by block September 2019.geojson' #@title Example form fields #@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown ---gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id name color radius value geometry
0 1 Block 245102502063018 #fff 4.999286 NaN POLYGON ((-76.66168 39.26342, -76.66136 39.26373, -76.66096 39.26343, -76.66041 39.26306, -76.66053 39.26296, -76.66070 39.26281, -76.66077 39.26284, -76.66100 39.26296, -76.66117 39.26306, -76.66129 39.26315, -76.66141 39.26322, -76.66168 39.26342))
1 2 Block 245102006001022 #fff 4.999286 NaN POLYGON ((-76.66319 39.28427, -76.66306 39.28436, -76.66301 39.28439, -76.66293 39.28439, -76.66271 39.28418, -76.66303 39.28402, -76.66317 39.28417, -76.66320 39.28423, -76.66319 39.28427))
2 3 Block 245102804033017 #fff 4.999286 NaN POLYGON ((-76.71122 39.27927, -76.71121 39.27975, -76.71121 39.27990, -76.71119 39.28076, -76.71045 39.27932, -76.71045 39.27920, -76.71056 39.27907, -76.71123 39.27880, -76.71122 39.27927))
3 4 Block 245102716006003 #fff 4.999286 NaN POLYGON ((-76.66960 39.34512, -76.66905 39.34526, -76.66886 39.34531, -76.66848 39.34539, -76.66835 39.34502, -76.66867 39.34498, -76.66924 39.34484, -76.66946 39.34475, -76.66960 39.34512))
4 5 Block 245102709023013 #fff 4.999286 NaN POLYGON ((-76.58976 39.35401, -76.58972 39.35415, -76.58966 39.35435, -76.58964 39.35447, -76.58960 39.35469, -76.58940 39.35467, -76.58925 39.35465, -76.58912 39.35465, -76.58902 39.35464, -76.58898 39.35464, -76.58906 39.35425, -76.58917 39.35385, -76.58925 39.35385, -76.58939 39.35386, -76.58979 39.35393, -76.58976 39.35401))
gdf.plot(column = 'value')
point_df = gdf.copy() point_df['centroids'] = df.centroid point_df = point_df.drop(columns = 'geometry') point_df = point_df.set_geometry('centroids') point_df.head(1) point_df.plot(marker='o', color='red', markersize='value')point_df.value.hist()point_df[point_df.value > 1000].plot()baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")from dataplay.geoms import workWithGeometryDatabaltimore.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry'], dtype='object')pointsInPolys = workWithGeometryData(method = 'ponp', df = point_df, polys = baltimore, ptsCoordCol ='centroids', polygonsCoordCol = 'geometry', polygonsLabel = 'CSA2010') pointsInPolys.plot(column='value', legend=True, markersize = 1)Total Points: 13506.0 Total Points in Polygons: 13453 Prcnt Points in Polygons: 0.9960758181548941
ponp_copy = pointsInPolys.copy()ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int).reset_index(name='NumberMissingCount').head()
CSA2010 NumberMissingCount
0 Allendale/Irvington/S. Hilton 290
1 Beechfield/Ten Hills/West Hills 180
2 Belair-Edison 250
3 Brooklyn/Curtis Bay/Hawkins Point 580
4 Canton 0
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()gdf.value.unique()gdf.value.describe()count 5677.000000 mean 40.862603 std 175.769408 min 1.000000 25% 2.000000 50% 5.000000 75% 21.000000 max 6978.000000 Name: value, dtype: float64gdf.value.var() # Return unbiased variance over requested axis.30894.884924369213gdf.value.sem() # Return unbiased standard error of the mean over requested axis.2.332834040372481gdf.value.nunique() # Count distinct observations over requested axis.364# DataFrame.shape Return a tuple representing the dimensionality of the DataFrame. gdf.shape #DataFrame.size Return an int representing the number of elements in this object. gdf.size # DataFrame.ndim Return an integer representing the number of axes/array dimensions. gdf.ndim # Note Used : # DataFrame.axes Return a list representing the axes of the DataFrame. gdf.dtypes # Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). gdf.kurtosis()(13506, 6)810362id object name object color object radius float64 value float64 geometry geometry dtype: objectid -1.200000 radius 1132.851562 value 495.784213 dtype: float64

Load Tracts

# This will just beautify the output.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 20)

# The attributes are what we will use.
in_crs = 2248       # The CRS we receive our data in.
out_crs = 4326      # The CRS we would like to have our data represented as.
geom = 'geometry'   # The column where our spatial information lives.

# A URL to load the boundary data from.
boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv"

# Read in the dataframe.
gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010)

# Convert the geometry column datatype from a string of text into a coordinate datatype.
gdf['geometry'] = gdf['geometry'].apply(lambda x: loads(str(x)))

# Process the dataframe as a geodataframe with a known CRS and geom column.
gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')

# BUG FIX(review): the exported source called .head() on the URL string
# (which has no .head); inspect the loaded frame instead — confirm intent.
gdf.head()

Ensure merge is on consistent datatypes

# Cast both join keys to string so the upcoming merge compares like dtypes.
for frame, key in ((gdf, 'GEOID10'), (scooterdf, 'nameChange2')):
    frame[key] = frame[key].astype(str)

Perform the merge

scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)scooterdfClean.head()# scooterdf.to_csv('./scooterdf.csv', index=False) # gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)lsscooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()scooterdfClean.groupby('CSA')['value'].mean()scooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.CSA.value_counts()https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html

Inspect Routes Data

fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson'] columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent'] gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id color streetname trip_count_sum trip_count_average trip_count_percent geometry
0 150197772 rgb(218, 231, 241) Jefferson Street 24 0.774194 0.0% LINESTRING (-76.58576 39.29660, -76.58550 39.29661)
1 150155955 rgb(164, 190, 219) 206 6.645161 0.3% LINESTRING (-76.61320 39.28136, -76.61318 39.28115)
2 150191673 rgb(204, 221, 236) Harford Avenue 49 1.580645 0.1% LINESTRING (-76.60169 39.30535, -76.60163 39.30544, -76.60156 39.30555)
3 150184657 rgb(229, 239, 246) 11 0.354839 0.0% LINESTRING (-76.61493 39.29294, -76.61491 39.29258, -76.61490 39.29230)
4 150188407 rgb(169, 194, 221) 178 5.741935 0.2% LINESTRING (-76.61662 39.29053, -76.61661 39.29046)
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")tst = """background = alt.Chart(gdf).mark_geoshape( stroke='white', strokeWidth=2 ).encode( color=alt.value('#eee'), ).properties( width=700, height=500 )""" # GeoDataFrame could be passed as usual pd.DataFrame city = alt.Chart(baltimore).mark_geoshape( ).project( ).encode( color='tpop10', # shorthand infer types as for regular pd.DataFrame tooltip='CSA2010' # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) city + routes

Clean the gdf of empties.

# Drop rows with a missing geometry, then rows with an empty geometry.
# BUG FIX(review): `gdf[~gdf.isna()]` indexes with a frame-shaped boolean
# mask, which masks individual cells (filling NaN) instead of dropping
# rows; filter on the geometry column instead.  Confirm against data.
gdf = gdf[~gdf.geometry.isna()]
gdf = gdf[~gdf.is_empty]

Now get the extremities; this will take a minute.

# hide_output gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))

Split the gdf into a left and right dataset

# Split the routes frame into endpoint-specific frames.
# Left endpoints keep the descriptive columns; right endpoints keep only
# the id and the right-hand coordinates.
right_only_drops = ['geometry', 'leftx', 'lefty', 'streetname',
                    'trip_count_sum', 'trip_count_average',
                    'trip_count_percent', 'color']
gdf_left = gdf.drop(columns=['geometry', 'rightx', 'righty'])
gdf_right = gdf.drop(columns=right_only_drops)

The coordinate variables of object type will cause problems.

gdf_right.dtypesid int64 righty float64 rightx object dtype: object

Let's go ahead and coerce a correction.

# Coerce the coordinate columns to numeric; the False/type markers that
# split() stuffed into failure rows become NaN.
for frame, col in ((gdf_right, 'righty'), (gdf_left, 'lefty'),
                   (gdf_right, 'rightx'), (gdf_left, 'leftx')):
    frame[col] = pd.to_numeric(frame[col], errors='coerce')

Now we can view the results

gdf_right.dtypesid int64 righty float64 rightx float64 dtype: object

Save these CSVs, because it took a while to compute them.

# Persist both endpoint frames so the slow extraction above need not be rerun.
for frame, path in ((gdf_right, 'rightRouts.csv'), (gdf_left, 'leftRouts.csv')):
    frame.to_csv(path)

Convert the datasets to geodataframes for further exploration!

# We could create a geodataframe like this #temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) ) #temp_gdf.head() # Alternately this could work.. unfinished.. but wkt.loads can make a Point from text # gdf_right['strCol']=gdf_right['rightx'].astype(str) # gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)# hide_output left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)# hide_output right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)right_csaMap.head()
id righty rightx geometry
0 150197772 -76.585503 39.296607 POINT (-76.58550 39.29661)
1 150155955 -76.613183 39.281147 POINT (-76.61318 39.28115)
2 150191673 -76.601555 39.305545 POINT (-76.60156 39.30555)
3 150184657 -76.614899 39.292301 POINT (-76.61490 39.29230)
4 150188407 -76.616608 39.290458 POINT (-76.61661 39.29046)
left_csaMap.head()left_csaMap.plot(column='trip_count_sum')right_csaMap.plot()baltimore.columns# hide_output right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry')right_csaMap.head()left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore, 'geometry', 'geometry')left_csaMap.head()left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_polyscooterdfClean = left_points_in_poly.merge(right_points_in_poly, left_on='CSA2010', right_on='CSA2010', how='left')scooterdfClean.columnsright_points_in_poly.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry', 'pointsinpolygon'], dtype='object')right_points_in_poly.head()right_points_in_poly.plot('pointsinpolygon', legend=True)
import altair as alt import geopandas as gpd import gpdvega # GeoDataFrame could be passed as usual pd.DataFrame chart = alt.Chart(right_points_in_poly).mark_geoshape( ).project( ).encode( color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) chart + routes chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'}) chart.save(fileName[:-8]+'.json')

Inspect Origins-Destinations Data

ls 'Trip origins-destinations by month'# @title Example form fields #@markdown Forms support many types of fields. fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson'] columnName = "name" #@param ['name', 'value', 'color', 'radius'] #@markdown --- gdf = gpd.read_file( findFile('./', fileName) ) gdf.plot( column = columnName) gdf.columns gdf[['id','name', 'value', 'color', 'radius']].head(5)dxp.bar(x='color', y='value', data=gdf, aggfunc='median')dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")#hide !pip install nbdev from nbdev import * ! nbdev_upgrade

Introduction

Binder Binder Binder Open Source Love svg3

NPM License Active Python Versions GitHub last commit No Maintenance Intended

GitHub stars GitHub watchers GitHub forks GitHub followers

Tweet Twitter Follow

This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.

Dataset: Scooter data:

  • Routes: 3 months (September to August 2019)
  • Deployment/
  • Routes
  • Trip origins-destinations by month
# do you see this? - Carlos# Hello world! - Michael# Checking - Brian

Local File Access (Optional)

# hide_output # (Optional) Run this cell to gain access to Google Drive (Colabs only) from google.colab import drive # Colabs operates in a virtualized enviornment # Colabs default directory is at ~/content. # We mount Drive into a temporary folder at '~/content/drive' drive.mount('/content/drive')Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code Enter your authorization code: ·········· Mounted at /content/drive

File paths will vary

# cd ./drive/'My Drive'/BNIA/responsive_records/Routes# cd ../content/drive/My Drive/DATA/scooter/content/drive/My Drive/DATA/scooter # Michael's Directory # cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIAcd ./Routes/content/drive/My Drive/DATA/scooter/Routes # hide_output ! ls leftRouts.csv 'Routing November 2019.geojson' rightRouts.csv 'Routing October 2019.geojson' 'Routing August 2019.geojson' 'Routing September 2019.geojson' 'Routing December 2019.geojson' # hide_output !cd ../ && ls boundsdf.csv rightRouts.csv 'Trip origins-destinations by month' Deployment Routes leftRouts.csv scooterdf.csv

Installs

! pip install dexplot !pip install folium !pip install geopandas !pip install ipyleaflet !pip install gpdvega !pip install dataplay

Imports

import os import pandas as pd import geopandas as gpd import dexplot as dxp import folium as fol import json import altair as alt import gpdvega import dataplay # These imports will handle everything import os import sys import csv from IPython.display import clear_output import matplotlib.pyplot as plt import numpy as np import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame import psycopg2 import pyproj from pyproj import Proj, transform # conda install -c conda-forge proj4 from shapely.geometry import Point from shapely import wkb from shapely.wkt import loads # https://pypi.org/project/geopy/ from geopy.geocoders import Nominatim import folium from folium import plugins from dataplay.merge import mergeDatasets import dexplot as dxp # In case file is KML, enable support import fiona fiona.drvsupport.supported_drivers['kml'] = 'rw' fiona.drvsupport.supported_drivers['KML'] = 'rw' #this cell is good to copy from shapely.geometry import LineString pd.plotting.register_matplotlib_converters() from dataplay.geoms import readInGeometryData from shapely import wkt from dataplay.geoms import readInGeometryData import ipywidgets as widgets !jupyter nbextension enable --py widgetsnbextension from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' import ipywidgets as widgets from ipywidgets import interact, interact_manual alt.data_transformers.enable('default', max_rows=None)

Convenience Functions

# Return the boundaries of a Linestring
def split(geom):
    """Return the endpoint coordinates of a LineString geometry.

    Returns (left_x, left_y, right_x, right_y) on success, or
    (False, <geom type>, False, <geom type>) when the geometry is not a
    LineString or its boundary is empty (e.g. a closed ring).
    """
    print(str(type(geom)))
    # Only a LineString has a usable two-point boundary; the original guard
    # compared against an empty string (always False), so every geometry was
    # rejected. Check the type name instead. A closed ring reports
    # 'MULTIPOINT EMPTY' and must still be skipped.
    if type(geom).__name__ == "LineString" and not str(geom.boundary) == 'MULTIPOINT EMPTY':
        print(geom.boundary)
        left, right = geom.boundary
        print('left x: ', type(left.x), left.x)
        print('left y: ', type(left.y), left.y)
        print('right x: ', type(right.x), right.x)
        print('right y: ', type(right.y), right.y)
        return left.x, left.y, right.x, right.y
    else:
        # Signal failure and surface the offending datatype for debugging.
        return False, type(geom), False, type(geom)


def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count how many points of *pts* fall inside each polygon of *polygons*.

    Adds a 'pointsinpolygon' column to *polygons* (mutated in place) and
    returns it. Matched points are dropped from the local *pts* copy so each
    subsequent polygon has less hunting to do.
    """
    count = 0
    total = pts.size / len(pts.columns)  # number of rows in pts
    # Per-polygon hit counts, aligned with polygons' row order.
    pts_in_polys = []
    # Loop over polygons with index i.
    for i, poly in polygons.iterrows():
        # Keep a list of points in this poly.
        pts_in_this_poly = []
        # Now loop over all remaining points with index j.
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # It's a hit! Record it and drop it so we have less hunting.
                pts_in_this_poly.append(pt[ptsCoordCol])
                pts = pts.drop([j])
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count = count + len(pts_in_this_poly)
        clear_output(wait=True)
    # Add the number of points for each poly to the dataframe.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    print('Prcnt Points in Polygons: ', count / total)
    return polygons

File Access Convenience Functions

# Find Relative Path to Files
def findFile(root, file):
    """Walk *root* recursively and return the first match for *file*.

    The result is formatted '<dir>/<file>' (matching the original notebook's
    output, e.g. './/Routing September 2019.geojson' when root is './').
    Returns None when the file is not found anywhere under *root*.
    """
    for d, subD, f in os.walk(root):
        if file in f:
            return "{1}/{0}".format(file, d)
    # Explicit None (the original fell off the end; the dead `break` after
    # `return` has been removed).
    return None


# To 'import' a script you wrote, map its filepath into the sys path.
def addPath(root, file):
    """Append the absolute path of *file* (searched from *root*) to sys.path.

    Fix: the original ignored *root* and always searched './'; the parameter
    is now honoured (callers passing './' see identical behaviour).
    """
    sys.path.append(os.path.abspath(findFile(root, file)))

# Notebook output preserved:
# findFile('./', 'Routing September 2019.geojson')
#   -> './/Routing September 2019.geojson'
# ls: leftRouts.csv  'Routing November 2019.geojson'  rightRouts.csv
#     'Routing October 2019.geojson'  'Routing August 2019.geojson'
#     'Routing September 2019.geojson'  'Routing December 2019.geojson'

Inspect Deployment Data

ls 'Trip origins-destinations by month''Trip Destinations by block August 2019.geojson' 'Trip Destinations by block December 2019.geojson' 'Trip Destinations by block November 2019.geojson' 'Trip Destinations by block October 2019.geojson' 'Trip Destinations by block September 2019.geojson' 'Trip Origins by block August 2019.geojson' 'Trip Origins by block December 2019.geojson' 'Trip Origins by block November 2019.geojson' 'Trip Origins by block October 2019.geojson' 'Trip Origins by block September 2019.geojson' #@title Example form fields #@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown ---gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id name color radius value geometry
0 1 Block 245102502063018 #fff 4.999286 NaN POLYGON ((-76.66168 39.26342, -76.66136 39.26373, -76.66096 39.26343, -76.66041 39.26306, -76.66053 39.26296, -76.66070 39.26281, -76.66077 39.26284, -76.66100 39.26296, -76.66117 39.26306, -76.66129 39.26315, -76.66141 39.26322, -76.66168 39.26342))
1 2 Block 245102006001022 #fff 4.999286 NaN POLYGON ((-76.66319 39.28427, -76.66306 39.28436, -76.66301 39.28439, -76.66293 39.28439, -76.66271 39.28418, -76.66303 39.28402, -76.66317 39.28417, -76.66320 39.28423, -76.66319 39.28427))
2 3 Block 245102804033017 #fff 4.999286 NaN POLYGON ((-76.71122 39.27927, -76.71121 39.27975, -76.71121 39.27990, -76.71119 39.28076, -76.71045 39.27932, -76.71045 39.27920, -76.71056 39.27907, -76.71123 39.27880, -76.71122 39.27927))
3 4 Block 245102716006003 #fff 4.999286 NaN POLYGON ((-76.66960 39.34512, -76.66905 39.34526, -76.66886 39.34531, -76.66848 39.34539, -76.66835 39.34502, -76.66867 39.34498, -76.66924 39.34484, -76.66946 39.34475, -76.66960 39.34512))
4 5 Block 245102709023013 #fff 4.999286 NaN POLYGON ((-76.58976 39.35401, -76.58972 39.35415, -76.58966 39.35435, -76.58964 39.35447, -76.58960 39.35469, -76.58940 39.35467, -76.58925 39.35465, -76.58912 39.35465, -76.58902 39.35464, -76.58898 39.35464, -76.58906 39.35425, -76.58917 39.35385, -76.58925 39.35385, -76.58939 39.35386, -76.58979 39.35393, -76.58976 39.35401))
gdf.plot(column = 'value')
# Build a point layer from the polygon layer's centroids.
point_df = gdf.copy()
# BUG FIX: the original read `df.centroid`, but `df` is undefined in this
# notebook — the centroids come from `gdf`.
point_df['centroids'] = gdf.centroid
point_df = point_df.drop(columns='geometry')
point_df = point_df.set_geometry('centroids')
point_df.head(1)
point_df.plot(marker='o', color='red', markersize='value')
point_df.value.hist()
point_df[point_df.value > 1000].plot()
# CSA (community) boundaries for Baltimore.
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")
from dataplay.geoms import workWithGeometryData
baltimore.columns
# Output: Index(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17',
#        'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17',
#        'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10',
#        'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17',
#        'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17',
#        'Shape__Area', 'Shape__Length', 'geometry'], dtype='object')
# Tag each centroid with the CSA polygon that contains it ('ponp' =
# point-in-polygon — semantics per dataplay; confirm against its docs).
pointsInPolys = workWithGeometryData(method='ponp', df=point_df, polys=baltimore,
                                     ptsCoordCol='centroids',
                                     polygonsCoordCol='geometry',
                                     polygonsLabel='CSA2010')
pointsInPolys.plot(column='value', legend=True, markersize=1)
# Output: Total Points:  13506.0
#         Total Points in Polygons:  13453
#         Prcnt Points in Polygons:  0.9960758181548941
ponp_copy = pointsInPolys.copy()ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int).reset_index(name='NumberMissingCount').head()
CSA2010 NumberMissingCount
0 Allendale/Irvington/S. Hilton 290
1 Beechfield/Ten Hills/West Hills 180
2 Belair-Edison 250
3 Brooklyn/Curtis Bay/Hawkins Point 580
4 Canton 0
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()gdf.value.unique()gdf.value.describe()count 5677.000000 mean 40.862603 std 175.769408 min 1.000000 25% 2.000000 50% 5.000000 75% 21.000000 max 6978.000000 Name: value, dtype: float64gdf.value.var() # Return unbiased variance over requested axis.30894.884924369213gdf.value.sem() # Return unbiased standard error of the mean over requested axis.2.332834040372481gdf.value.nunique() # Count distinct observations over requested axis.364# DataFrame.shape Return a tuple representing the dimensionality of the DataFrame. gdf.shape #DataFrame.size Return an int representing the number of elements in this object. gdf.size # DataFrame.ndim Return an integer representing the number of axes/array dimensions. gdf.ndim # Note Used : # DataFrame.axes Return a list representing the axes of the DataFrame. gdf.dtypes # Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). gdf.kurtosis()(13506, 6)810362id object name object color object radius float64 value float64 geometry geometry dtype: objectid -1.200000 radius 1132.851562 value 495.784213 dtype: float64

Load Tracts

# This will just beautify the output. pd.set_option('display.expand_frame_repr', False) pd.set_option('display.precision', 2) from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" pd.set_option('max_colwidth', 20)# The attributes are what we will use. in_crs = 2248 # The CRS we recieve our data. out_crs = 4326 # The CRS we would like to have our data represented as. geom = 'geometry' # The column where our spatial information lives. # A Url to load boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv" # Read in the dataframe gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010) # Convert the geometry column datatype from a string of text into a coordinate datatype. gdf['geometry'] = gdf['geometry'].apply(lambda x: loads( str(x) )) # Process the dataframe as a geodataframe with a known CRS and geom column. gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')boundariesBaltimoreTractsNoWater2010.head()

Ensure merge is on consistent datatypes

gdf['GEOID10'] = gdf['GEOID10'].astype(str) scooterdf['nameChange2'] = scooterdf['nameChange2'].astype(str)

Perform the merge

scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)scooterdfClean.head()# scooterdf.to_csv('./scooterdf.csv', index=False) # gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)lsscooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()scooterdfClean.groupby('CSA')['value'].mean()scooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.CSA.value_counts()https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html

Inspect Routes Data

fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson'] columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent'] gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id color streetname trip_count_sum trip_count_average trip_count_percent geometry
0 150197772 rgb(218, 231, 241) Jefferson Street 24 0.774194 0.0% LINESTRING (-76.58576 39.29660, -76.58550 39.29661)
1 150155955 rgb(164, 190, 219) 206 6.645161 0.3% LINESTRING (-76.61320 39.28136, -76.61318 39.28115)
2 150191673 rgb(204, 221, 236) Harford Avenue 49 1.580645 0.1% LINESTRING (-76.60169 39.30535, -76.60163 39.30544, -76.60156 39.30555)
3 150184657 rgb(229, 239, 246) 11 0.354839 0.0% LINESTRING (-76.61493 39.29294, -76.61491 39.29258, -76.61490 39.29230)
4 150188407 rgb(169, 194, 221) 178 5.741935 0.2% LINESTRING (-76.61662 39.29053, -76.61661 39.29046)
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")tst = """background = alt.Chart(gdf).mark_geoshape( stroke='white', strokeWidth=2 ).encode( color=alt.value('#eee'), ).properties( width=700, height=500 )""" # GeoDataFrame could be passed as usual pd.DataFrame city = alt.Chart(baltimore).mark_geoshape( ).project( ).encode( color='tpop10', # shorthand infer types as for regular pd.DataFrame tooltip='CSA2010' # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) city + routes

Clean the gdf of empties.

gdf = gdf[~gdf.isna()] gdf = gdf[~gdf.is_empty]

Now get the extremities; this will take a minute.

# hide_output gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))

Split the gdf into a left and right dataset

gdf_left = gdf.copy() gdf_left = gdf_left.drop(columns = ['geometry','rightx', 'righty']) gdf_right= gdf.copy() gdf_right = gdf_right.drop(columns = ['geometry','leftx', 'lefty', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent', 'color' ])

The coordinate variables of object type will cause problems.

gdf_right.dtypesid int64 righty float64 rightx object dtype: object

Let's go ahead and coerce a correction.

gdf_right['righty']=pd.to_numeric(gdf_right['righty'], errors='coerce') gdf_left['lefty']=pd.to_numeric(gdf_left['lefty'], errors='coerce') gdf_right['rightx']=pd.to_numeric(gdf_right['rightx'], errors='coerce') gdf_left['leftx']=pd.to_numeric(gdf_left['leftx'], errors='coerce')

Now we can view the results

gdf_right.dtypesid int64 righty float64 rightx float64 dtype: object

Save these CSVs because it took a minute to get to where we are now.

gdf_right.to_csv('rightRouts.csv') gdf_left.to_csv('leftRouts.csv')

Convert the datasets to geodataframes for further exploration!

# We could create a geodataframe like this #temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) ) #temp_gdf.head() # Alternately this could work.. unfinished.. but wkt.loads can make a Point from text # gdf_right['strCol']=gdf_right['rightx'].astype(str) # gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)# hide_output left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)# hide_output right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)right_csaMap.head()
id righty rightx geometry
0 150197772 -76.585503 39.296607 POINT (-76.58550 39.29661)
1 150155955 -76.613183 39.281147 POINT (-76.61318 39.28115)
2 150191673 -76.601555 39.305545 POINT (-76.60156 39.30555)
3 150184657 -76.614899 39.292301 POINT (-76.61490 39.29230)
4 150188407 -76.616608 39.290458 POINT (-76.61661 39.29046)
left_csaMap.head()left_csaMap.plot(column='trip_count_sum')right_csaMap.plot()baltimore.columns# hide_output right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry')right_csaMap.head()left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore, 'geometry', 'geometry')left_csaMap.head()left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_polyscooterdfClean = left_points_in_poly.merge(right_points_in_poly, left_on='CSA2010', right_on='CSA2010', how='left')scooterdfClean.columnsright_points_in_poly.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry', 'pointsinpolygon'], dtype='object')right_points_in_poly.head()right_points_in_poly.plot('pointsinpolygon', legend=True)
import altair as alt import geopandas as gpd import gpdvega # GeoDataFrame could be passed as usual pd.DataFrame chart = alt.Chart(right_points_in_poly).mark_geoshape( ).project( ).encode( color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) chart + routes chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'}) chart.save(fileName[:-8]+'.json')

Inspect Origins-Destinations Data

ls 'Trip origins-destinations by month'# @title Example form fields #@markdown Forms support many types of fields. fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson'] columnName = "name" #@param ['name', 'value', 'color', 'radius'] #@markdown --- gdf = gpd.read_file( findFile('./', fileName) ) gdf.plot( column = columnName) gdf.columns gdf[['id','name', 'value', 'color', 'radius']].head(5)dxp.bar(x='color', y='value', data=gdf, aggfunc='median')dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")#hide !pip install nbdev from nbdev import * ! nbdev_upgrade

Introduction

Binder Binder Binder Open Source Love svg3

NPM License Active Python Versions GitHub last commit No Maintenance Intended

GitHub stars GitHub watchers GitHub forks GitHub followers

Tweet Twitter Follow

This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.

Dataset: Scooter data:

  • Routes: 5 months (August to December 2019)
  • Deployment/
  • Routes
  • Trip origins-destinations by month
# do you see this? - Carlos# Hello world! - Michael# Checking - Brian

Local File Access (Optional)

# hide_output # (Optional) Run this cell to gain access to Google Drive (Colabs only) from google.colab import drive # Colabs operates in a virtualized enviornment # Colabs default directory is at ~/content. # We mount Drive into a temporary folder at '~/content/drive' drive.mount('/content/drive')Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code Enter your authorization code: ·········· Mounted at /content/drive

File paths will vary

# cd ./drive/'My Drive'/BNIA/responsive_records/Routes# cd ../content/drive/My Drive/DATA/scooter/content/drive/My Drive/DATA/scooter # Michael's Directory # cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIAcd ./Routes/content/drive/My Drive/DATA/scooter/Routes # hide_output ! ls leftRouts.csv 'Routing November 2019.geojson' rightRouts.csv 'Routing October 2019.geojson' 'Routing August 2019.geojson' 'Routing September 2019.geojson' 'Routing December 2019.geojson' # hide_output !cd ../ && ls boundsdf.csv rightRouts.csv 'Trip origins-destinations by month' Deployment Routes leftRouts.csv scooterdf.csv

Installs

! pip install dexplot !pip install folium !pip install geopandas !pip install ipyleaflet !pip install gpdvega !pip install dataplay

Imports

import os import pandas as pd import geopandas as gpd import dexplot as dxp import folium as fol import json import altair as alt import gpdvega import dataplay # These imports will handle everything import os import sys import csv from IPython.display import clear_output import matplotlib.pyplot as plt import numpy as np import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame import psycopg2 import pyproj from pyproj import Proj, transform # conda install -c conda-forge proj4 from shapely.geometry import Point from shapely import wkb from shapely.wkt import loads # https://pypi.org/project/geopy/ from geopy.geocoders import Nominatim import folium from folium import plugins from dataplay.merge import mergeDatasets import dexplot as dxp # In case file is KML, enable support import fiona fiona.drvsupport.supported_drivers['kml'] = 'rw' fiona.drvsupport.supported_drivers['KML'] = 'rw' #this cell is good to copy from shapely.geometry import LineString pd.plotting.register_matplotlib_converters() from dataplay.geoms import readInGeometryData from shapely import wkt from dataplay.geoms import readInGeometryData import ipywidgets as widgets !jupyter nbextension enable --py widgetsnbextension from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' import ipywidgets as widgets from ipywidgets import interact, interact_manual alt.data_transformers.enable('default', max_rows=None)

Convenience Functions

# Return the boundary endpoints of a LineString geometry.
def split(geom):
    """Return the endpoint coordinates (left_x, left_y, right_x, right_y) of a LineString.

    The routes geometry column may contain non-LineString or empty
    geometries (whose boundary prints as 'MULTIPOINT EMPTY'); for those
    the sentinel tuple (False, type(geom), False, type(geom)) is returned
    so callers can filter them out afterwards.
    """
    # geom_type is the reliable way to test the geometry kind; the previous
    # comparison str(type(geom)) == "" could never be true, so every row
    # silently fell through to the sentinel branch.
    if geom.geom_type == 'LineString' and str(geom.boundary) != 'MULTIPOINT EMPTY':
        # The boundary of a non-closed LineString is a MultiPoint holding
        # its two endpoints, which unpack into the two Point objects.
        left, right = geom.boundary
        return left.x, left.y, right.x, right.y
    return False, type(geom), False, type(geom)


def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count how many points of `pts` fall inside each polygon of `polygons`.

    Parameters
    ----------
    pts : DataFrame/GeoDataFrame whose column `ptsCoordCol` holds point geometries.
    polygons : DataFrame/GeoDataFrame whose column `polygonsCoordCol` holds polygons.
    ptsCoordCol, polygonsCoordCol : names of the geometry columns to compare.

    Returns
    -------
    `polygons` with a new 'pointsinpolygon' column of per-polygon counts.
    Also prints coverage statistics (total points, points matched, percent).
    """
    count = 0
    # Row count of pts (size is rows * columns).
    total = pts.size / len(pts.columns)
    # Per-polygon counts, in polygon iteration order.
    pts_in_polys = []
    for i, poly in polygons.iterrows():
        pts_in_this_poly = []
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                pts_in_this_poly.append(pt[ptsCoordCol])
                # Drop matched points so later polygons scan fewer rows.
                # (The inner iterator was built from the pre-drop frame, so
                # this only affects subsequent polygons, not this pass.)
                pts = pts.drop([j])
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count = count + len(pts_in_this_poly)
        clear_output(wait=True)
    # NOTE(review): counts are plain ints; gpd.GeoSeries is kept only for
    # behavioural compatibility with the original. It uses a default 0..n-1
    # index — assumes `polygons` has a default RangeIndex; confirm.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    print('Prcnt Points in Polygons: ', count / total)
    return polygons

File Access Convenience Functions

# Find Relative Path to Files
def findFile(root, file):
    """Walk `root` recursively and return the relative path to `file`.

    Example: findFile('./', 'Routing September 2019.geojson')
    returns './/Routing September 2019.geojson'.

    Returns None when the file is not found anywhere under `root`.
    """
    for d, subdirs, files in os.walk(root):
        if file in files:
            # Path is assembled as '<dir>/<filename>'; the dead `break`
            # that followed this return has been removed.
            return "{1}/{0}".format(file, d)
    return None


# To 'import' a script you wrote, map its filepath into the sys path.
def addPath(root, file):
    """Locate `file` under `root` and append its absolute path to sys.path.

    Fix: the search previously ignored `root` and always started from './'.
    """
    sys.path.append(os.path.abspath(findFile(root, file)))

Inspect Deployment Data

ls 'Trip origins-destinations by month''Trip Destinations by block August 2019.geojson' 'Trip Destinations by block December 2019.geojson' 'Trip Destinations by block November 2019.geojson' 'Trip Destinations by block October 2019.geojson' 'Trip Destinations by block September 2019.geojson' 'Trip Origins by block August 2019.geojson' 'Trip Origins by block December 2019.geojson' 'Trip Origins by block November 2019.geojson' 'Trip Origins by block October 2019.geojson' 'Trip Origins by block September 2019.geojson' #@title Example form fields #@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown ---gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id name color radius value geometry
0 1 Block 245102502063018 #fff 4.999286 NaN POLYGON ((-76.66168 39.26342, -76.66136 39.26373, -76.66096 39.26343, -76.66041 39.26306, -76.66053 39.26296, -76.66070 39.26281, -76.66077 39.26284, -76.66100 39.26296, -76.66117 39.26306, -76.66129 39.26315, -76.66141 39.26322, -76.66168 39.26342))
1 2 Block 245102006001022 #fff 4.999286 NaN POLYGON ((-76.66319 39.28427, -76.66306 39.28436, -76.66301 39.28439, -76.66293 39.28439, -76.66271 39.28418, -76.66303 39.28402, -76.66317 39.28417, -76.66320 39.28423, -76.66319 39.28427))
2 3 Block 245102804033017 #fff 4.999286 NaN POLYGON ((-76.71122 39.27927, -76.71121 39.27975, -76.71121 39.27990, -76.71119 39.28076, -76.71045 39.27932, -76.71045 39.27920, -76.71056 39.27907, -76.71123 39.27880, -76.71122 39.27927))
3 4 Block 245102716006003 #fff 4.999286 NaN POLYGON ((-76.66960 39.34512, -76.66905 39.34526, -76.66886 39.34531, -76.66848 39.34539, -76.66835 39.34502, -76.66867 39.34498, -76.66924 39.34484, -76.66946 39.34475, -76.66960 39.34512))
4 5 Block 245102709023013 #fff 4.999286 NaN POLYGON ((-76.58976 39.35401, -76.58972 39.35415, -76.58966 39.35435, -76.58964 39.35447, -76.58960 39.35469, -76.58940 39.35467, -76.58925 39.35465, -76.58912 39.35465, -76.58902 39.35464, -76.58898 39.35464, -76.58906 39.35425, -76.58917 39.35385, -76.58925 39.35385, -76.58939 39.35386, -76.58979 39.35393, -76.58976 39.35401))
gdf.plot(column = 'value')
point_df = gdf.copy() point_df['centroids'] = df.centroid point_df = point_df.drop(columns = 'geometry') point_df = point_df.set_geometry('centroids') point_df.head(1) point_df.plot(marker='o', color='red', markersize='value')point_df.value.hist()point_df[point_df.value > 1000].plot()baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")from dataplay.geoms import workWithGeometryDatabaltimore.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry'], dtype='object')pointsInPolys = workWithGeometryData(method = 'ponp', df = point_df, polys = baltimore, ptsCoordCol ='centroids', polygonsCoordCol = 'geometry', polygonsLabel = 'CSA2010') pointsInPolys.plot(column='value', legend=True, markersize = 1)Total Points: 13506.0 Total Points in Polygons: 13453 Prcnt Points in Polygons: 0.9960758181548941
ponp_copy = pointsInPolys.copy()ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int).reset_index(name='NumberMissingCount').head()
CSA2010 NumberMissingCount
0 Allendale/Irvington/S. Hilton 290
1 Beechfield/Ten Hills/West Hills 180
2 Belair-Edison 250
3 Brooklyn/Curtis Bay/Hawkins Point 580
4 Canton 0
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()gdf.value.unique()gdf.value.describe()count 5677.000000 mean 40.862603 std 175.769408 min 1.000000 25% 2.000000 50% 5.000000 75% 21.000000 max 6978.000000 Name: value, dtype: float64gdf.value.var() # Return unbiased variance over requested axis.30894.884924369213gdf.value.sem() # Return unbiased standard error of the mean over requested axis.2.332834040372481gdf.value.nunique() # Count distinct observations over requested axis.364# DataFrame.shape Return a tuple representing the dimensionality of the DataFrame. gdf.shape #DataFrame.size Return an int representing the number of elements in this object. gdf.size # DataFrame.ndim Return an integer representing the number of axes/array dimensions. gdf.ndim # Note Used : # DataFrame.axes Return a list representing the axes of the DataFrame. gdf.dtypes # Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). gdf.kurtosis()(13506, 6)810362id object name object color object radius float64 value float64 geometry geometry dtype: objectid -1.200000 radius 1132.851562 value 495.784213 dtype: float64

Load Tracts

# This will just beautify the output. pd.set_option('display.expand_frame_repr', False) pd.set_option('display.precision', 2) from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" pd.set_option('max_colwidth', 20)# The attributes are what we will use. in_crs = 2248 # The CRS we recieve our data. out_crs = 4326 # The CRS we would like to have our data represented as. geom = 'geometry' # The column where our spatial information lives. # A Url to load boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv" # Read in the dataframe gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010) # Convert the geometry column datatype from a string of text into a coordinate datatype. gdf['geometry'] = gdf['geometry'].apply(lambda x: loads( str(x) )) # Process the dataframe as a geodataframe with a known CRS and geom column. gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')boundariesBaltimoreTractsNoWater2010.head()

Ensure merge is on consistent datatypes

gdf['GEOID10'] = gdf['GEOID10'].astype(str) scooterdf['nameChange2'] = scooterdf['nameChange2'].astype(str)

Perform the merge

scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)scooterdfClean.head()# scooterdf.to_csv('./scooterdf.csv', index=False) # gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)lsscooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()scooterdfClean.groupby('CSA')['value'].mean()scooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.CSA.value_counts()https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html

Inspect Routes Data

fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson'] columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent'] gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id color streetname trip_count_sum trip_count_average trip_count_percent geometry
0 150197772 rgb(218, 231, 241) Jefferson Street 24 0.774194 0.0% LINESTRING (-76.58576 39.29660, -76.58550 39.29661)
1 150155955 rgb(164, 190, 219) 206 6.645161 0.3% LINESTRING (-76.61320 39.28136, -76.61318 39.28115)
2 150191673 rgb(204, 221, 236) Harford Avenue 49 1.580645 0.1% LINESTRING (-76.60169 39.30535, -76.60163 39.30544, -76.60156 39.30555)
3 150184657 rgb(229, 239, 246) 11 0.354839 0.0% LINESTRING (-76.61493 39.29294, -76.61491 39.29258, -76.61490 39.29230)
4 150188407 rgb(169, 194, 221) 178 5.741935 0.2% LINESTRING (-76.61662 39.29053, -76.61661 39.29046)
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")tst = """background = alt.Chart(gdf).mark_geoshape( stroke='white', strokeWidth=2 ).encode( color=alt.value('#eee'), ).properties( width=700, height=500 )""" # GeoDataFrame could be passed as usual pd.DataFrame city = alt.Chart(baltimore).mark_geoshape( ).project( ).encode( color='tpop10', # shorthand infer types as for regular pd.DataFrame tooltip='CSA2010' # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) city + routes

Clean the gdf of empties.

gdf = gdf[~gdf.isna()] gdf = gdf[~gdf.is_empty]

Now get the extremities; this will take a minute.

# hide_output gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))

Split the gdf into a left and right dataset

gdf_left = gdf.copy() gdf_left = gdf_left.drop(columns = ['geometry','rightx', 'righty']) gdf_right= gdf.copy() gdf_right = gdf_right.drop(columns = ['geometry','leftx', 'lefty', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent', 'color' ])

The coordinate variables of object type will cause problems.

gdf_right.dtypesid int64 righty float64 rightx object dtype: object

Let's go ahead and coerce a correction.

gdf_right['righty']=pd.to_numeric(gdf_right['righty'], errors='coerce') gdf_left['lefty']=pd.to_numeric(gdf_left['lefty'], errors='coerce') gdf_right['rightx']=pd.to_numeric(gdf_right['rightx'], errors='coerce') gdf_left['leftx']=pd.to_numeric(gdf_left['leftx'], errors='coerce')

Now we can view the results

gdf_right.dtypesid int64 righty float64 rightx float64 dtype: object

Save these CSVs because it took a minute to get to where we are now.

gdf_right.to_csv('rightRouts.csv') gdf_left.to_csv('leftRouts.csv')

Convert the datasets to geodataframes for further exploration!

# We could create a geodataframe like this #temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) ) #temp_gdf.head() # Alternately this could work.. unfinished.. but wkt.loads can make a Point from text # gdf_right['strCol']=gdf_right['rightx'].astype(str) # gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)# hide_output left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)# hide_output right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)right_csaMap.head()
id righty rightx geometry
0 150197772 -76.585503 39.296607 POINT (-76.58550 39.29661)
1 150155955 -76.613183 39.281147 POINT (-76.61318 39.28115)
2 150191673 -76.601555 39.305545 POINT (-76.60156 39.30555)
3 150184657 -76.614899 39.292301 POINT (-76.61490 39.29230)
4 150188407 -76.616608 39.290458 POINT (-76.61661 39.29046)
left_csaMap.head()left_csaMap.plot(column='trip_count_sum')right_csaMap.plot()baltimore.columns# hide_output right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry')right_csaMap.head()left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore, 'geometry', 'geometry')left_csaMap.head()left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_polyscooterdfClean = left_points_in_poly.merge(right_points_in_poly, left_on='CSA2010', right_on='CSA2010', how='left')scooterdfClean.columnsright_points_in_poly.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry', 'pointsinpolygon'], dtype='object')right_points_in_poly.head()right_points_in_poly.plot('pointsinpolygon', legend=True)
import altair as alt import geopandas as gpd import gpdvega # GeoDataFrame could be passed as usual pd.DataFrame chart = alt.Chart(right_points_in_poly).mark_geoshape( ).project( ).encode( color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) chart + routes chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'}) chart.save(fileName[:-8]+'.json')

Inspect Origins-Destinations Data

ls 'Trip origins-destinations by month'# @title Example form fields #@markdown Forms support many types of fields. fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson'] columnName = "name" #@param ['name', 'value', 'color', 'radius'] #@markdown --- gdf = gpd.read_file( findFile('./', fileName) ) gdf.plot( column = columnName) gdf.columns gdf[['id','name', 'value', 'color', 'radius']].head(5)dxp.bar(x='color', y='value', data=gdf, aggfunc='median')dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")#hide !pip install nbdev from nbdev import * ! nbdev_upgrade

Introduction

Binder Binder Binder Open Source Love svg3

NPM License Active Python Versions GitHub last commit No Maintenance Intended

GitHub stars GitHub watchers GitHub forks GitHub followers

Tweet Twitter Follow

This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.

Dataset: Scooter data:

  • Routes: 5 months (August to December 2019)
  • Deployment/
  • Routes
  • Trip origins-destinations by month
# do you see this? - Carlos# Hello world! - Michael# Checking - Brian

Local File Access (Optional)

# hide_output # (Optional) Run this cell to gain access to Google Drive (Colabs only) from google.colab import drive # Colabs operates in a virtualized enviornment # Colabs default directory is at ~/content. # We mount Drive into a temporary folder at '~/content/drive' drive.mount('/content/drive')Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code Enter your authorization code: ·········· Mounted at /content/drive

File paths will vary

# cd ./drive/'My Drive'/BNIA/responsive_records/Routes# cd ../content/drive/My Drive/DATA/scooter/content/drive/My Drive/DATA/scooter # Michael's Directory # cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIAcd ./Routes/content/drive/My Drive/DATA/scooter/Routes # hide_output ! ls leftRouts.csv 'Routing November 2019.geojson' rightRouts.csv 'Routing October 2019.geojson' 'Routing August 2019.geojson' 'Routing September 2019.geojson' 'Routing December 2019.geojson' # hide_output !cd ../ && ls boundsdf.csv rightRouts.csv 'Trip origins-destinations by month' Deployment Routes leftRouts.csv scooterdf.csv

Installs

! pip install dexplot !pip install folium !pip install geopandas !pip install ipyleaflet !pip install gpdvega !pip install dataplay

Imports

import os import pandas as pd import geopandas as gpd import dexplot as dxp import folium as fol import json import altair as alt import gpdvega import dataplay # These imports will handle everything import os import sys import csv from IPython.display import clear_output import matplotlib.pyplot as plt import numpy as np import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame import psycopg2 import pyproj from pyproj import Proj, transform # conda install -c conda-forge proj4 from shapely.geometry import Point from shapely import wkb from shapely.wkt import loads # https://pypi.org/project/geopy/ from geopy.geocoders import Nominatim import folium from folium import plugins from dataplay.merge import mergeDatasets import dexplot as dxp # In case file is KML, enable support import fiona fiona.drvsupport.supported_drivers['kml'] = 'rw' fiona.drvsupport.supported_drivers['KML'] = 'rw' #this cell is good to copy from shapely.geometry import LineString pd.plotting.register_matplotlib_converters() from dataplay.geoms import readInGeometryData from shapely import wkt from dataplay.geoms import readInGeometryData import ipywidgets as widgets !jupyter nbextension enable --py widgetsnbextension from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' import ipywidgets as widgets from ipywidgets import interact, interact_manual alt.data_transformers.enable('default', max_rows=None)

Convenience Functions

# Return the boundary endpoints of a LineString.
def split(geom):
    """Return ``(left.x, left.y, right.x, right.y)`` for a LineString.

    If *geom* is not a LineString, or its boundary is empty (e.g. a
    closed ring), return ``(False, type(geom), False, type(geom))`` so
    the caller can spot and filter the bad rows afterwards.

    NOTE(review): the original type check compared against an empty
    string (``if str(type(geom)) == ""``), which can never be true --
    the "<class '...LineString'>" text was almost certainly stripped by
    the notebook export. Restored here as a ``geom_type`` check.
    """
    print(str(type(geom)))
    if getattr(geom, 'geom_type', None) == 'LineString' \
            and not str(geom.boundary) == 'MULTIPOINT EMPTY':
        print(geom.boundary)
        left, right = geom.boundary
        print('left x: ', type(left.x), left.x)
        print('left y: ', type(left.y), left.y)
        print('right x: ', type(right.x), right.x)
        print('right y: ', type(right.y), right.y)
        return left.x, left.y, right.x, right.y
    else:
        return False, type(geom), False, type(geom)


def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count how many points of *pts* fall inside each polygon.

    Parameters
    ----------
    pts : (Geo)DataFrame whose *ptsCoordCol* column holds point geometries.
    polygons : (Geo)DataFrame whose *polygonsCoordCol* column holds
        polygon geometries. MUTATED in place: a 'pointsinpolygon'
        column is added, and the mutated frame is also returned.
    ptsCoordCol, polygonsCoordCol : geometry column names.

    Matched points are dropped from the working copy of *pts* so later
    polygons have less hunting; each point is counted at most once.
    """
    count = 0
    total = pts.size / len(pts.columns)  # number of rows in pts
    # Number of points found per polygon, in polygon row order.
    pts_in_polys = []
    for i, poly in polygons.iterrows():
        pts_in_this_poly = []
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # It's a hit -- record it and drop the point so it is
                # never matched to a later polygon.
                pts_in_this_poly.append(pt[ptsCoordCol])
                pts = pts.drop([j])
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count = count + len(pts_in_this_poly)
        clear_output(wait=True)
    # Add the per-polygon counts to the dataframe.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    print('Prcnt Points in Polygons: ', count / total)
    return polygons

File Access Convenience Functions

# Find the relative path to *file* under *root*.
def findFile(root, file):
    """Walk *root* and return '<dir>/<file>' for the first match.

    Returns None when the file is not found anywhere under *root*.
    (The original had an unreachable ``break`` after the ``return``;
    removed.)
    """
    for d, subD, f in os.walk(root):
        if file in f:
            return "{1}/{0}".format(file, d)


# To 'import' a script you wrote, map its directory into sys.path.
def addPath(root, file):
    """Append the directory containing *file* (searched under *root*)
    to ``sys.path`` so the script can be imported.

    NOTE(review): the original ignored *root* (always searched './')
    and appended the file path itself rather than its directory --
    a path Python cannot import from. Both fixed here; a missing file
    is now a no-op instead of a TypeError.
    """
    found = findFile(root, file)
    if found is not None:
        sys.path.append(os.path.abspath(os.path.dirname(found)))

Inspect Deployment Data

ls 'Trip origins-destinations by month''Trip Destinations by block August 2019.geojson' 'Trip Destinations by block December 2019.geojson' 'Trip Destinations by block November 2019.geojson' 'Trip Destinations by block October 2019.geojson' 'Trip Destinations by block September 2019.geojson' 'Trip Origins by block August 2019.geojson' 'Trip Origins by block December 2019.geojson' 'Trip Origins by block November 2019.geojson' 'Trip Origins by block October 2019.geojson' 'Trip Origins by block September 2019.geojson' #@title Example form fields #@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown ---gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id name color radius value geometry
0 1 Block 245102502063018 #fff 4.999286 NaN POLYGON ((-76.66168 39.26342, -76.66136 39.26373, -76.66096 39.26343, -76.66041 39.26306, -76.66053 39.26296, -76.66070 39.26281, -76.66077 39.26284, -76.66100 39.26296, -76.66117 39.26306, -76.66129 39.26315, -76.66141 39.26322, -76.66168 39.26342))
1 2 Block 245102006001022 #fff 4.999286 NaN POLYGON ((-76.66319 39.28427, -76.66306 39.28436, -76.66301 39.28439, -76.66293 39.28439, -76.66271 39.28418, -76.66303 39.28402, -76.66317 39.28417, -76.66320 39.28423, -76.66319 39.28427))
2 3 Block 245102804033017 #fff 4.999286 NaN POLYGON ((-76.71122 39.27927, -76.71121 39.27975, -76.71121 39.27990, -76.71119 39.28076, -76.71045 39.27932, -76.71045 39.27920, -76.71056 39.27907, -76.71123 39.27880, -76.71122 39.27927))
3 4 Block 245102716006003 #fff 4.999286 NaN POLYGON ((-76.66960 39.34512, -76.66905 39.34526, -76.66886 39.34531, -76.66848 39.34539, -76.66835 39.34502, -76.66867 39.34498, -76.66924 39.34484, -76.66946 39.34475, -76.66960 39.34512))
4 5 Block 245102709023013 #fff 4.999286 NaN POLYGON ((-76.58976 39.35401, -76.58972 39.35415, -76.58966 39.35435, -76.58964 39.35447, -76.58960 39.35469, -76.58940 39.35467, -76.58925 39.35465, -76.58912 39.35465, -76.58902 39.35464, -76.58898 39.35464, -76.58906 39.35425, -76.58917 39.35385, -76.58925 39.35385, -76.58939 39.35386, -76.58979 39.35393, -76.58976 39.35401))
# Choropleth of the block-level trip counts ('value' column).
gdf.plot(column = 'value')
# Reduce each block polygon to its centroid so trips can be treated as points.
# NOTE(review): the original read `df.centroid` -- `df` is never defined
# in this notebook; `gdf.centroid` must have been intended.
point_df = gdf.copy()
point_df['centroids'] = gdf.centroid
point_df = point_df.drop(columns = 'geometry')
point_df = point_df.set_geometry('centroids')
point_df.head(1)

# Quick visual checks of the centroid layer.
point_df.plot(marker='o', color='red', markersize='value')
point_df.value.hist()
point_df[point_df.value > 1000].plot()

# Baltimore CSA boundaries for the point-in-polygon aggregation.
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")
from dataplay.geoms import workWithGeometryData
baltimore.columns

# Tag each centroid with the CSA that contains it, then plot.
pointsInPolys = workWithGeometryData(
    method = 'ponp', df = point_df, polys = baltimore,
    ptsCoordCol = 'centroids', polygonsCoordCol = 'geometry',
    polygonsLabel = 'CSA2010')
pointsInPolys.plot(column='value', legend=True, markersize = 1)
# Per-CSA count of blocks with a missing trip 'value'.
ponp_copy = pointsInPolys.copy()
ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) \
    .reset_index(name='NumberMissingCount').head()
CSA2010 NumberMissingCount
0 Allendale/Irvington/S. Hilton 290
1 Beechfield/Ten Hills/West Hills 180
2 Belair-Edison 250
3 Brooklyn/Curtis Bay/Hawkins Point 580
4 Canton 0
# Ratio of present to missing values per CSA.
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) \
    / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)

# Missing count relative to the CSA's value total, as a percentage.
ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) \
    / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100

# Per-CSA totals with missing values filled as -1.
ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()

# Descriptive statistics of the trip counts.
gdf.value.unique()
gdf.value.describe()
gdf.value.var()      # Return unbiased variance over requested axis.
gdf.value.sem()      # Return unbiased standard error of the mean.
gdf.value.nunique()  # Count distinct observations over requested axis.

# Frame-level shape/size/dtype checks.
gdf.shape   # tuple representing the dimensionality of the DataFrame
gdf.size    # number of elements in this object
gdf.ndim    # number of axes / array dimensions
gdf.dtypes
gdf.kurtosis()  # unbiased Fisher kurtosis (kurtosis of normal == 0.0)

Load Tracts

# This will just beautify the output.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 20)

# The attributes are what we will use.
in_crs = 2248   # The CRS we receive our data in.
out_crs = 4326  # The CRS we would like our data represented as.
geom = 'geometry'  # The column where our spatial information lives.

# A URL to load the tract boundaries.
# NOTE(review): in the original export this assignment was fused into
# the preceding comment ("# A Url to load boundaries...").
boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv"

# Read in the dataframe.
gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010)
# Convert the geometry column datatype from text into a coordinate datatype.
gdf['geometry'] = gdf['geometry'].apply(lambda x: loads( str(x) ))
# Process the dataframe as a geodataframe with a known CRS and geom column.
gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')

# NOTE(review): the original ended with
# `boundariesBaltimoreTractsNoWater2010.head()` -- .head() on the URL
# string, which would raise; the loaded frame was presumably meant.
gdf.head()

Ensure merge is on consistent datatypes

# Normalise the join keys on both frames to strings so the merge below
# compares like with like.
for frame, key in ((gdf, 'GEOID10'), (scooterdf, 'nameChange2')):
    frame[key] = frame[key].astype(str)

Perform the merge

scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)scooterdfClean.head()# scooterdf.to_csv('./scooterdf.csv', index=False) # gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)lsscooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()scooterdfClean.groupby('CSA')['value'].mean()scooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.CSA.value_counts()https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html

Inspect Routes Data

# Routes form: pick a month's routing file and a column to inspect.
fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson']
columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent']

# Load the selected routing GeoJSON and peek at it.
gdf = gpd.read_file( findFile('./', fileName) )
gdf.head()
id color streetname trip_count_sum trip_count_average trip_count_percent geometry
0 150197772 rgb(218, 231, 241) Jefferson Street 24 0.774194 0.0% LINESTRING (-76.58576 39.29660, -76.58550 39.29661)
1 150155955 rgb(164, 190, 219) 206 6.645161 0.3% LINESTRING (-76.61320 39.28136, -76.61318 39.28115)
2 150191673 rgb(204, 221, 236) Harford Avenue 49 1.580645 0.1% LINESTRING (-76.60169 39.30535, -76.60163 39.30544, -76.60156 39.30555)
3 150184657 rgb(229, 239, 246) 11 0.354839 0.0% LINESTRING (-76.61493 39.29294, -76.61491 39.29258, -76.61490 39.29230)
4 150188407 rgb(169, 194, 221) 178 5.741935 0.2% LINESTRING (-76.61662 39.29053, -76.61661 39.29046)
# CSA boundaries for the base layer.
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")

# NOTE(review): the original kept an unused triple-quoted string `tst`
# holding an alternate background chart; removed as dead code.

# A GeoDataFrame can be passed to Altair as a usual pd.DataFrame.
city = alt.Chart(baltimore).mark_geoshape(
).project(
).encode(
    color='tpop10',   # shorthand infers types as for regular pd.DataFrame
    tooltip='CSA2010' # GeoDataFrame.index is accessible as id
).properties(
    width=500,
    height=300
)
routes = alt.Chart(gdf).mark_geoshape(
    filled=False,
    strokeWidth=2
)
city + routes

Clean the gdf of empties.

# Drop rows with missing or empty geometries before computing endpoints.
# NOTE(review): the original `gdf = gdf[~gdf.isna()]` indexed with a
# whole boolean DataFrame, which masks element-wise (turning cells into
# NaN) rather than filtering rows; the null check belongs on the
# geometry column.
gdf = gdf[~gdf.geometry.isna()]
gdf = gdf[~gdf.is_empty]

Now get the extremities; this will take a minute.

# NOTE(review): the assignment-target order here does NOT match what
# split() returns (left.x, left.y, right.x, right.y) -- so 'lefty'
# actually receives left.x (a longitude) and 'leftx' receives left.y,
# and likewise for the right columns. Downstream cells compensate by
# passing lat='leftx', lng='lefty' to readInGeometryData, so renaming
# here alone would break them; flagged only, not changed.
# hide_output gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))

Split the gdf into a left and right dataset

# Split the routes frame into a "left endpoint" view (keeps the trip
# metadata) and a "right endpoint" view (coordinates and id only).
left_drop = ['geometry', 'rightx', 'righty']
right_drop = ['geometry', 'leftx', 'lefty', 'streetname', 'trip_count_sum',
              'trip_count_average', 'trip_count_percent', 'color']
gdf_left = gdf.copy().drop(columns=left_drop)
gdf_right = gdf.copy().drop(columns=right_drop)

The coordinate variables of object type will cause problems.

gdf_right.dtypesid int64 righty float64 rightx object dtype: object

Let's go ahead and coerce a correction.

# Coerce the endpoint coordinate columns to numeric; unparseable values
# (the False sentinels produced by split) become NaN.
for frame, col in ((gdf_right, 'righty'), (gdf_left, 'lefty'),
                   (gdf_right, 'rightx'), (gdf_left, 'leftx')):
    frame[col] = pd.to_numeric(frame[col], errors='coerce')

Now we can view the results

gdf_right.dtypesid int64 righty float64 rightx float64 dtype: object

Save these CSVs because it took a while to get to where we are now.

# Persist both halves -- recomputing the endpoints takes a while.
for frame, path in ((gdf_right, 'rightRouts.csv'), (gdf_left, 'leftRouts.csv')):
    frame.to_csv(path)

Convert the datasets to geodataframes for further exploration!

# We could create a geodataframe like this:
# temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) )
# Alternately (unfinished), wkt.loads can make a Point from text:
# gdf_right['strCol']=gdf_right['rightx'].astype(str)
# gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)

# hide_output
# NOTE(review): lat/lng look swapped here ('leftx' actually holds a
# latitude because of the column-name mix-up flagged at the split()
# step) -- the two swaps cancel out, so do not "fix" one without the
# other. TODO confirm against readInGeometryData's signature.
left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False,
                                 lat= 'leftx', lng= 'lefty',
                                 revgeocode=False, save=False,
                                 in_crs=4268, out_crs=4268)
right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False,
                                  lat= 'rightx', lng= 'righty',
                                  revgeocode=False, save=False,
                                  in_crs=4268, out_crs=4268)
right_csaMap.head()
id righty rightx geometry
0 150197772 -76.585503 39.296607 POINT (-76.58550 39.29661)
1 150155955 -76.613183 39.281147 POINT (-76.61318 39.28115)
2 150191673 -76.601555 39.305545 POINT (-76.60156 39.30555)
3 150184657 -76.614899 39.292301 POINT (-76.61490 39.29230)
4 150188407 -76.616608 39.290458 POINT (-76.61661 39.29046)
# Visual sanity checks, then count route endpoints per CSA.
left_csaMap.head()
left_csaMap.plot(column='trip_count_sum')
right_csaMap.plot()
baltimore.columns

# hide_output
# NOTE(review): getPointsInPolygons mutates and returns its `polygons`
# argument, so the original's two calls on the same `baltimore` frame
# made left_points_in_poly and right_points_in_poly the SAME object --
# the left/right tables printed below were identical. Passing copies
# keeps the two results independent.
right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore.copy(), 'geometry', 'geometry')
right_csaMap.head()
left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore.copy(), 'geometry', 'geometry')
left_csaMap.head()
left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly

# Join the per-CSA endpoint counts for left (origins) and right
# (destinations) sides of each route.
scooterdfClean = left_points_in_poly.merge(right_points_in_poly,
                                           left_on='CSA2010',
                                           right_on='CSA2010', how='left')
scooterdfClean.columns
right_points_in_poly.columns
right_points_in_poly.head()
right_points_in_poly.plot('pointsinpolygon', legend=True)
# Altair choropleth of endpoints per CSA, with the route lines overlaid.
# (altair, geopandas and gpdvega are already imported above; the
# original cell re-imported them.)
import altair as alt
import geopandas as gpd
import gpdvega

# GeoDataFrame could be passed as usual pd.DataFrame.
chart = alt.Chart(right_points_in_poly).mark_geoshape(
).project(
).encode(
    color='pointsinpolygon',              # shorthand infers types
    tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is id
).properties(
    width=500,
    height=300
)
routes = alt.Chart(gdf).mark_geoshape(
    filled=False,
    strokeWidth=2
)
chart + routes

# Export the chart; strip the '.geojson' suffix from the source name.
chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'})
chart.save(fileName[:-8]+'.json')

Inspect Origins-Destinations Data

ls 'Trip origins-destinations by month'# @title Example form fields #@markdown Forms support many types of fields. fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson'] columnName = "name" #@param ['name', 'value', 'color', 'radius'] #@markdown --- gdf = gpd.read_file( findFile('./', fileName) ) gdf.plot( column = columnName) gdf.columns gdf[['id','name', 'value', 'color', 'radius']].head(5)dxp.bar(x='color', y='value', data=gdf, aggfunc='median')dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")#hide !pip install nbdev from nbdev import * ! nbdev_upgrade

Introduction

Binder Binder Binder Open Source Love svg3

NPM License Active Python Versions GitHub last commit No Maintenance Intended

GitHub stars GitHub watchers GitHub forks GitHub followers

Tweet Twitter Follow

This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.

Dataset: Scooter data:

  • Routes: 5 months (August to December 2019)
  • Deployment/
  • Routes
  • Trip origins-destinations by month
# do you see this? - Carlos# Hello world! - Michael# Checking - Brian

Local File Access (Optional)

# hide_output
# (Optional) Run this cell to gain access to Google Drive (Colab only).
# Colab operates in a virtualized environment whose default working
# directory is ~/content; we mount Drive into a temporary folder at
# '~/content/drive'.
# NOTE(review): the original cell text had the interactive OAuth prompt
# fused into the code by the notebook export; it has been stripped here.
from google.colab import drive

drive.mount('/content/drive')

File paths will vary

# cd ./drive/'My Drive'/BNIA/responsive_records/Routes# cd ../content/drive/My Drive/DATA/scooter/content/drive/My Drive/DATA/scooter # Michael's Directory # cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIAcd ./Routes/content/drive/My Drive/DATA/scooter/Routes # hide_output ! ls leftRouts.csv 'Routing November 2019.geojson' rightRouts.csv 'Routing October 2019.geojson' 'Routing August 2019.geojson' 'Routing September 2019.geojson' 'Routing December 2019.geojson' # hide_output !cd ../ && ls boundsdf.csv rightRouts.csv 'Trip origins-destinations by month' Deployment Routes leftRouts.csv scooterdf.csv

Installs

! pip install dexplot !pip install folium !pip install geopandas !pip install ipyleaflet !pip install gpdvega !pip install dataplay

Imports

import os import pandas as pd import geopandas as gpd import dexplot as dxp import folium as fol import json import altair as alt import gpdvega import dataplay # These imports will handle everything import os import sys import csv from IPython.display import clear_output import matplotlib.pyplot as plt import numpy as np import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame import psycopg2 import pyproj from pyproj import Proj, transform # conda install -c conda-forge proj4 from shapely.geometry import Point from shapely import wkb from shapely.wkt import loads # https://pypi.org/project/geopy/ from geopy.geocoders import Nominatim import folium from folium import plugins from dataplay.merge import mergeDatasets import dexplot as dxp # In case file is KML, enable support import fiona fiona.drvsupport.supported_drivers['kml'] = 'rw' fiona.drvsupport.supported_drivers['KML'] = 'rw' #this cell is good to copy from shapely.geometry import LineString pd.plotting.register_matplotlib_converters() from dataplay.geoms import readInGeometryData from shapely import wkt from dataplay.geoms import readInGeometryData import ipywidgets as widgets !jupyter nbextension enable --py widgetsnbextension from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' import ipywidgets as widgets from ipywidgets import interact, interact_manual alt.data_transformers.enable('default', max_rows=None)

Convenience Functions

# Return the boundaries of a Linestring
def split(geom):
    """Return the two boundary endpoints of a LineString geometry.

    Parameters
    ----------
    geom : shapely geometry
        Expected to be a LineString. Anything else — or a closed ring,
        whose boundary is 'MULTIPOINT EMPTY' — is rejected.

    Returns
    -------
    tuple
        (left.x, left.y, right.x, right.y) on success, otherwise
        (False, type(geom), False, type(geom)) so the caller can see
        which rows were skipped and what datatype they held.
    """
    # BUG FIX: the original compared str(type(geom)) == "", which is never
    # true (the "<class '...'>" text was lost in the notebook export), so
    # EVERY geometry fell into the failure branch. Check geom_type instead,
    # which shapely exposes on every geometry object.
    if geom.geom_type == 'LineString' and not str(geom.boundary) == 'MULTIPOINT EMPTY':
        # Unpack the two-point boundary (start and end of the line).
        # NOTE(review): direct iteration over a MultiPoint works on
        # shapely 1.x; shapely 2.x requires .geoms — confirm the pinned
        # shapely version before upgrading.
        left, right = geom.boundary
        return left.x, left.y, right.x, right.y
    else:
        # If it's not a usable LineString, don't run it — stuff False
        # and the datatype so downstream coercion can drop these rows.
        return False, type(geom), False, type(geom)

def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count how many points fall inside each polygon (point-in-polygon join).

    Parameters
    ----------
    pts : GeoDataFrame
        Point records; consumed destructively within this call (matched
        rows are dropped so later polygons scan fewer candidates).
    polygons : GeoDataFrame
        Polygon records; gains a new 'pointsinpolygon' count column.
        NOTE(review): mutated in place AND returned — callers holding a
        reference see the new column appear.
    ptsCoordCol, polygonsCoordCol : str
        Names of the geometry columns in each frame.

    Returns
    -------
    GeoDataFrame
        `polygons` with a 'pointsinpolygon' column of match counts.
    """
    count = 0
    # Row count of pts (size = rows * cols).
    total = pts.size / len(pts.columns)
    # We're going to keep a list of how many points we find per polygon.
    pts_in_polys = []
    # Loop over polygons with index i.
    for i, poly in polygons.iterrows():
        # Keep a list of points in this poly.
        pts_in_this_poly = []
        # Now loop over all points with index j.
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # It's a hit! Record it, and drop the point so we have
                # less hunting on subsequent polygons.
                pts_in_this_poly.append(pt[ptsCoordCol])
                pts = pts.drop([j])
        # We only need the number of matches, not the points themselves.
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count = count + len(pts_in_this_poly)
        # Keep the notebook output to a single progress line.
        clear_output(wait=True)
    # Add the number of points for each poly to the dataframe.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    print('Prcnt Points in Polygons: ', count / total)
    return polygons

File Access Convenience Functions

# Find Relative Path to Files
def findFile(root, file):
    """Walk *root* recursively and return the relative path to *file*.

    Example: findFile('./', 'Routing September 2019.geojson')
    returns './/Routing September 2019.geojson' for a hit in the
    root directory itself.

    Returns None when the file is not found (the original fell off
    the end of the loop, returning None implicitly).
    """
    for d, subD, f in os.walk(root):
        if file in f:
            # d is the directory the walk found the file in; keep the
            # original "<dir>/<file>" formatting exactly.
            # (Removed an unreachable `break` that followed this return.)
            return "{1}/{0}".format(file, d)
    return None

# To 'import' a script you wrote, map its filepath into the sys path.
def addPath(root, file):
    """Append the absolute path of *file* (located under *root*) to sys.path.

    BUG FIX: the original ignored its *root* argument and hardcoded './';
    callers that passed './' are unaffected.
    """
    sys.path.append(os.path.abspath(findFile(root, file)))

Inspect Deployment Data

ls 'Trip origins-destinations by month''Trip Destinations by block August 2019.geojson' 'Trip Destinations by block December 2019.geojson' 'Trip Destinations by block November 2019.geojson' 'Trip Destinations by block October 2019.geojson' 'Trip Destinations by block September 2019.geojson' 'Trip Origins by block August 2019.geojson' 'Trip Origins by block December 2019.geojson' 'Trip Origins by block November 2019.geojson' 'Trip Origins by block October 2019.geojson' 'Trip Origins by block September 2019.geojson' #@title Example form fields #@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown ---gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id name color radius value geometry
0 1 Block 245102502063018 #fff 4.999286 NaN POLYGON ((-76.66168 39.26342, -76.66136 39.26373, -76.66096 39.26343, -76.66041 39.26306, -76.66053 39.26296, -76.66070 39.26281, -76.66077 39.26284, -76.66100 39.26296, -76.66117 39.26306, -76.66129 39.26315, -76.66141 39.26322, -76.66168 39.26342))
1 2 Block 245102006001022 #fff 4.999286 NaN POLYGON ((-76.66319 39.28427, -76.66306 39.28436, -76.66301 39.28439, -76.66293 39.28439, -76.66271 39.28418, -76.66303 39.28402, -76.66317 39.28417, -76.66320 39.28423, -76.66319 39.28427))
2 3 Block 245102804033017 #fff 4.999286 NaN POLYGON ((-76.71122 39.27927, -76.71121 39.27975, -76.71121 39.27990, -76.71119 39.28076, -76.71045 39.27932, -76.71045 39.27920, -76.71056 39.27907, -76.71123 39.27880, -76.71122 39.27927))
3 4 Block 245102716006003 #fff 4.999286 NaN POLYGON ((-76.66960 39.34512, -76.66905 39.34526, -76.66886 39.34531, -76.66848 39.34539, -76.66835 39.34502, -76.66867 39.34498, -76.66924 39.34484, -76.66946 39.34475, -76.66960 39.34512))
4 5 Block 245102709023013 #fff 4.999286 NaN POLYGON ((-76.58976 39.35401, -76.58972 39.35415, -76.58966 39.35435, -76.58964 39.35447, -76.58960 39.35469, -76.58940 39.35467, -76.58925 39.35465, -76.58912 39.35465, -76.58902 39.35464, -76.58898 39.35464, -76.58906 39.35425, -76.58917 39.35385, -76.58925 39.35385, -76.58939 39.35386, -76.58979 39.35393, -76.58976 39.35401))
gdf.plot(column = 'value')
point_df = gdf.copy() point_df['centroids'] = df.centroid point_df = point_df.drop(columns = 'geometry') point_df = point_df.set_geometry('centroids') point_df.head(1) point_df.plot(marker='o', color='red', markersize='value')point_df.value.hist()point_df[point_df.value > 1000].plot()baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")from dataplay.geoms import workWithGeometryDatabaltimore.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry'], dtype='object')pointsInPolys = workWithGeometryData(method = 'ponp', df = point_df, polys = baltimore, ptsCoordCol ='centroids', polygonsCoordCol = 'geometry', polygonsLabel = 'CSA2010') pointsInPolys.plot(column='value', legend=True, markersize = 1)Total Points: 13506.0 Total Points in Polygons: 13453 Prcnt Points in Polygons: 0.9960758181548941
ponp_copy = pointsInPolys.copy()ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int).reset_index(name='NumberMissingCount').head()
CSA2010 NumberMissingCount
0 Allendale/Irvington/S. Hilton 290
1 Beechfield/Ten Hills/West Hills 180
2 Belair-Edison 250
3 Brooklyn/Curtis Bay/Hawkins Point 580
4 Canton 0
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()gdf.value.unique()gdf.value.describe()count 5677.000000 mean 40.862603 std 175.769408 min 1.000000 25% 2.000000 50% 5.000000 75% 21.000000 max 6978.000000 Name: value, dtype: float64gdf.value.var() # Return unbiased variance over requested axis.30894.884924369213gdf.value.sem() # Return unbiased standard error of the mean over requested axis.2.332834040372481gdf.value.nunique() # Count distinct observations over requested axis.364# DataFrame.shape Return a tuple representing the dimensionality of the DataFrame. gdf.shape #DataFrame.size Return an int representing the number of elements in this object. gdf.size # DataFrame.ndim Return an integer representing the number of axes/array dimensions. gdf.ndim # Note Used : # DataFrame.axes Return a list representing the axes of the DataFrame. gdf.dtypes # Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). gdf.kurtosis()(13506, 6)810362id object name object color object radius float64 value float64 geometry geometry dtype: objectid -1.200000 radius 1132.851562 value 495.784213 dtype: float64

Load Tracts

# This will just beautify the output. pd.set_option('display.expand_frame_repr', False) pd.set_option('display.precision', 2) from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" pd.set_option('max_colwidth', 20)# The attributes are what we will use. in_crs = 2248 # The CRS we recieve our data. out_crs = 4326 # The CRS we would like to have our data represented as. geom = 'geometry' # The column where our spatial information lives. # A Url to load boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv" # Read in the dataframe gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010) # Convert the geometry column datatype from a string of text into a coordinate datatype. gdf['geometry'] = gdf['geometry'].apply(lambda x: loads( str(x) )) # Process the dataframe as a geodataframe with a known CRS and geom column. gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')boundariesBaltimoreTractsNoWater2010.head()

Ensure merge is on consistent datatypes

gdf['GEOID10'] = gdf['GEOID10'].astype(str) scooterdf['nameChange2'] = scooterdf['nameChange2'].astype(str)

Perform the merge

scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)scooterdfClean.head()# scooterdf.to_csv('./scooterdf.csv', index=False) # gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)lsscooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()scooterdfClean.groupby('CSA')['value'].mean()scooterdfClean.groupby('CSA')['value'].sum()scooterdfClean.CSA.value_counts()https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html

Inspect Routes Data

fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson'] columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent'] gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id color streetname trip_count_sum trip_count_average trip_count_percent geometry
0 150197772 rgb(218, 231, 241) Jefferson Street 24 0.774194 0.0% LINESTRING (-76.58576 39.29660, -76.58550 39.29661)
1 150155955 rgb(164, 190, 219) 206 6.645161 0.3% LINESTRING (-76.61320 39.28136, -76.61318 39.28115)
2 150191673 rgb(204, 221, 236) Harford Avenue 49 1.580645 0.1% LINESTRING (-76.60169 39.30535, -76.60163 39.30544, -76.60156 39.30555)
3 150184657 rgb(229, 239, 246) 11 0.354839 0.0% LINESTRING (-76.61493 39.29294, -76.61491 39.29258, -76.61490 39.29230)
4 150188407 rgb(169, 194, 221) 178 5.741935 0.2% LINESTRING (-76.61662 39.29053, -76.61661 39.29046)
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")tst = """background = alt.Chart(gdf).mark_geoshape( stroke='white', strokeWidth=2 ).encode( color=alt.value('#eee'), ).properties( width=700, height=500 )""" # GeoDataFrame could be passed as usual pd.DataFrame city = alt.Chart(baltimore).mark_geoshape( ).project( ).encode( color='tpop10', # shorthand infer types as for regular pd.DataFrame tooltip='CSA2010' # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) city + routes

Clean the gdf of empties.

gdf = gdf[~gdf.isna()] gdf = gdf[~gdf.is_empty]

Now get the extremities; this will take a minute.

# hide_output gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))

Split the gdf into a left and right dataset

gdf_left = gdf.copy() gdf_left = gdf_left.drop(columns = ['geometry','rightx', 'righty']) gdf_right= gdf.copy() gdf_right = gdf_right.drop(columns = ['geometry','leftx', 'lefty', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent', 'color' ])

The coordinate variables of object type will cause problems.

gdf_right.dtypesid int64 righty float64 rightx object dtype: object

Let's go ahead and coerce a correction.

gdf_right['righty']=pd.to_numeric(gdf_right['righty'], errors='coerce') gdf_left['lefty']=pd.to_numeric(gdf_left['lefty'], errors='coerce') gdf_right['rightx']=pd.to_numeric(gdf_right['rightx'], errors='coerce') gdf_left['leftx']=pd.to_numeric(gdf_left['leftx'], errors='coerce')

Now we can view the results

gdf_right.dtypesid int64 righty float64 rightx float64 dtype: object

Save these CSVs because it took a minute to get to where we are now.

gdf_right.to_csv('rightRouts.csv') gdf_left.to_csv('leftRouts.csv')

Convert the datasets to geodataframes for further exploration!

# We could create a geodataframe like this #temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) ) #temp_gdf.head() # Alternately this could work.. unfinished.. but wkt.loads can make a Point from text # gdf_right['strCol']=gdf_right['rightx'].astype(str) # gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)# hide_output left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)# hide_output right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)right_csaMap.head()
id righty rightx geometry
0 150197772 -76.585503 39.296607 POINT (-76.58550 39.29661)
1 150155955 -76.613183 39.281147 POINT (-76.61318 39.28115)
2 150191673 -76.601555 39.305545 POINT (-76.60156 39.30555)
3 150184657 -76.614899 39.292301 POINT (-76.61490 39.29230)
4 150188407 -76.616608 39.290458 POINT (-76.61661 39.29046)
left_csaMap.head()left_csaMap.plot(column='trip_count_sum')right_csaMap.plot()baltimore.columns# hide_output right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry')right_csaMap.head()left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore, 'geometry', 'geometry')left_csaMap.head()left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_polyscooterdfClean = left_points_in_poly.merge(right_points_in_poly, left_on='CSA2010', right_on='CSA2010', how='left')scooterdfClean.columnsright_points_in_poly.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry', 'pointsinpolygon'], dtype='object')right_points_in_poly.head()right_points_in_poly.plot('pointsinpolygon', legend=True)
import altair as alt import geopandas as gpd import gpdvega # GeoDataFrame could be passed as usual pd.DataFrame chart = alt.Chart(right_points_in_poly).mark_geoshape( ).project( ).encode( color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) chart + routes chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'}) chart.save(fileName[:-8]+'.json')

Inspect Origins-Destinations Data

ls 'Trip origins-destinations by month'# @title Example form fields #@markdown Forms support many types of fields. fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson'] columnName = "name" #@param ['name', 'value', 'color', 'radius'] #@markdown --- gdf = gpd.read_file( findFile('./', fileName) ) gdf.plot( column = columnName) gdf.columns gdf[['id','name', 'value', 'color', 'radius']].head(5)dxp.bar(x='color', y='value', data=gdf, aggfunc='median')dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")#hide !pip install nbdev from nbdev import * ! nbdev_upgrade

Introduction

Binder Binder Binder Open Source Love svg3

NPM License Active Python Versions GitHub last commit No Maintenance Intended

GitHub stars GitHub watchers GitHub forks GitHub followers

Tweet Twitter Follow

This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.

Dataset: Scooter data:

  • Routes: 5 months (August to December 2019)
  • Deployment/
  • Routes
  • Trip origins-destinations by month
# do you see this? - Carlos# Hello world! - Michael# Checking - Brian

Local File Access (Optional)

# hide_output # (Optional) Run this cell to gain access to Google Drive (Colabs only) from google.colab import drive # Colabs operates in a virtualized enviornment # Colabs default directory is at ~/content. # We mount Drive into a temporary folder at '~/content/drive' drive.mount('/content/drive')Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code Enter your authorization code: ·········· Mounted at /content/drive

File paths will vary

# cd ./drive/'My Drive'/BNIA/responsive_records/Routes# cd ../content/drive/My Drive/DATA/scooter/content/drive/My Drive/DATA/scooter # Michael's Directory # cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIAcd ./Routes/content/drive/My Drive/DATA/scooter/Routes # hide_output ! ls leftRouts.csv 'Routing November 2019.geojson' rightRouts.csv 'Routing October 2019.geojson' 'Routing August 2019.geojson' 'Routing September 2019.geojson' 'Routing December 2019.geojson' # hide_output !cd ../ && ls boundsdf.csv rightRouts.csv 'Trip origins-destinations by month' Deployment Routes leftRouts.csv scooterdf.csv

Installs

! pip install dexplot !pip install folium !pip install geopandas !pip install ipyleaflet !pip install gpdvega !pip install dataplay

Imports

import os import pandas as pd import geopandas as gpd import dexplot as dxp import folium as fol import json import altair as alt import gpdvega import dataplay # These imports will handle everything import os import sys import csv from IPython.display import clear_output import matplotlib.pyplot as plt import numpy as np import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame import psycopg2 import pyproj from pyproj import Proj, transform # conda install -c conda-forge proj4 from shapely.geometry import Point from shapely import wkb from shapely.wkt import loads # https://pypi.org/project/geopy/ from geopy.geocoders import Nominatim import folium from folium import plugins from dataplay.merge import mergeDatasets import dexplot as dxp # In case file is KML, enable support import fiona fiona.drvsupport.supported_drivers['kml'] = 'rw' fiona.drvsupport.supported_drivers['KML'] = 'rw' #this cell is good to copy from shapely.geometry import LineString pd.plotting.register_matplotlib_converters() from dataplay.geoms import readInGeometryData from shapely import wkt from dataplay.geoms import readInGeometryData import ipywidgets as widgets !jupyter nbextension enable --py widgetsnbextension from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' import ipywidgets as widgets from ipywidgets import interact, interact_manual alt.data_transformers.enable('default', max_rows=None)

Convenience Functions

# Return the boundary endpoints of a LineString geometry.
#
# Parameters:
#   geom: a shapely geometry; expected to be a LineString.
# Returns:
#   (left_x, left_y, right_x, right_y) when `geom` is a LineString with a
#   non-empty boundary; otherwise (False, type(geom), False, type(geom)) so
#   the rows that could not be split are easy to find afterwards.
def split(geom):
    print(str(type(geom)))
    # Bug fix: the original compared str(type(geom)) to "" which is always
    # False, so every row fell into the failure branch.  We now accept any
    # geometry whose type name contains "LineString" and whose boundary is
    # non-empty (a closed ring reports 'MULTIPOINT EMPTY').
    if "LineString" in str(type(geom)) and not str(geom.boundary) == 'MULTIPOINT EMPTY':
        print(geom.boundary)
        left, right = geom.boundary
        print('left x: ', type(left.x), left.x)
        print('left y: ', type(left.y), left.y)
        print('right x: ', type(right.x), right.x)
        print('right y: ', type(right.y), right.y)
        return left.x, left.y, right.x, right.y
    else:
        # Stuff False plus the datatype so bad rows can be coerced later.
        return False, type(geom), False, type(geom)


# Count how many points fall inside each polygon.
#
# Parameters:
#   pts: GeoDataFrame of point geometries.
#   polygons: GeoDataFrame of polygon geometries.
#   ptsCoordCol / polygonsCoordCol: geometry column name in each frame.
# Returns:
#   `polygons` with a new 'pointsinpolygon' column holding the counts.
# NOTE(review): mutates `polygons` in place; matched points are dropped from
# a local copy of `pts`, so each point is counted for at most one polygon.
def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    count = 0
    total = pts.size / len(pts.columns)
    # We're going to keep a list of how many points we find.
    pts_in_polys = []
    # Loop over polygons with index i.
    for i, poly in polygons.iterrows():
        # Keep a list of points in this poly
        pts_in_this_poly = []
        # Now loop over all points with index j.
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # Then it's a hit! Add it to the list,
                # and drop it so we have less hunting.
                pts_in_this_poly.append(pt[ptsCoordCol])
                pts = pts.drop([j])
        # We could do all sorts, like grab a property of the
        # points, but let's just append the number of them.
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count = count + len(pts_in_this_poly)
        clear_output(wait=True)
    # Add the number of points for each poly to the dataframe.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    print('Prcnt Points in Polygons: ', count / total)
    return polygons

File Access Convenience Functions

# Find the relative path to `file` by walking `root` recursively.
# Returns '<dir>/<file>' for the first match, or None when not found.
def findFile(root, file):
    for d, subD, f in os.walk(root):
        if file in f:
            return "{1}/{0}".format(file, d)
    # Bug fix: the original had an unconditional `break` after the first
    # directory, which defeated the recursive os.walk -- files living in
    # subdirectories were never found.  We now walk the whole tree and
    # return None explicitly on a miss.
    return None


# To 'import' a script you wrote, map its filepath into the sys path.
def addPath(root, file):
    # Bug fix: the original ignored `root` and always searched './'.
    sys.path.append(os.path.abspath(findFile(root, file)))

# Example from a previous notebook run:
# findFile('./', 'Routing September 2019.geojson')
#   -> './/Routing September 2019.geojson'

Inspect Deployment Data

ls 'Trip origins-destinations by month''Trip Destinations by block August 2019.geojson' 'Trip Destinations by block December 2019.geojson' 'Trip Destinations by block November 2019.geojson' 'Trip Destinations by block October 2019.geojson' 'Trip Destinations by block September 2019.geojson' 'Trip Origins by block August 2019.geojson' 'Trip Origins by block December 2019.geojson' 'Trip Origins by block November 2019.geojson' 'Trip Origins by block October 2019.geojson' 'Trip Origins by block September 2019.geojson' #@title Example form fields #@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown ---gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id name color radius value geometry
0 1 Block 245102502063018 #fff 4.999286 NaN POLYGON ((-76.66168 39.26342, -76.66136 39.26373, -76.66096 39.26343, -76.66041 39.26306, -76.66053 39.26296, -76.66070 39.26281, -76.66077 39.26284, -76.66100 39.26296, -76.66117 39.26306, -76.66129 39.26315, -76.66141 39.26322, -76.66168 39.26342))
1 2 Block 245102006001022 #fff 4.999286 NaN POLYGON ((-76.66319 39.28427, -76.66306 39.28436, -76.66301 39.28439, -76.66293 39.28439, -76.66271 39.28418, -76.66303 39.28402, -76.66317 39.28417, -76.66320 39.28423, -76.66319 39.28427))
2 3 Block 245102804033017 #fff 4.999286 NaN POLYGON ((-76.71122 39.27927, -76.71121 39.27975, -76.71121 39.27990, -76.71119 39.28076, -76.71045 39.27932, -76.71045 39.27920, -76.71056 39.27907, -76.71123 39.27880, -76.71122 39.27927))
3 4 Block 245102716006003 #fff 4.999286 NaN POLYGON ((-76.66960 39.34512, -76.66905 39.34526, -76.66886 39.34531, -76.66848 39.34539, -76.66835 39.34502, -76.66867 39.34498, -76.66924 39.34484, -76.66946 39.34475, -76.66960 39.34512))
4 5 Block 245102709023013 #fff 4.999286 NaN POLYGON ((-76.58976 39.35401, -76.58972 39.35415, -76.58966 39.35435, -76.58964 39.35447, -76.58960 39.35469, -76.58940 39.35467, -76.58925 39.35465, -76.58912 39.35465, -76.58902 39.35464, -76.58898 39.35464, -76.58906 39.35425, -76.58917 39.35385, -76.58925 39.35385, -76.58939 39.35386, -76.58979 39.35393, -76.58976 39.35401))
gdf.plot(column = 'value')
# Build a point layer from the polygon centroids so each block can be
# assigned to the Community Statistical Area (CSA) that contains it.
point_df = gdf.copy()
# Bug fix: the original read `df.centroid`, but `df` is undefined in this
# notebook -- the centroids come from `gdf`.
point_df['centroids'] = gdf.centroid
point_df = point_df.drop(columns='geometry')
point_df = point_df.set_geometry('centroids')
point_df.head(1)

# Quick visual sanity checks of the centroid layer.
point_df.plot(marker='o', color='red', markersize='value')
point_df.value.hist()
point_df[point_df.value > 1000].plot()

# Baltimore CSA boundaries used as the aggregation polygons.
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")
from dataplay.geoms import workWithGeometryData
baltimore.columns

# Tag each centroid with the CSA polygon that contains it ('ponp' =
# points-on-polygons), then plot the tagged values.
pointsInPolys = workWithGeometryData(method='ponp', df=point_df, polys=baltimore,
                                     ptsCoordCol='centroids', polygonsCoordCol='geometry',
                                     polygonsLabel='CSA2010')
pointsInPolys.plot(column='value', legend=True, markersize=1)
# Previous run reported: Total Points: 13506.0, in polygons: 13453 (99.6%).
ponp_copy = pointsInPolys.copy()ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int).reset_index(name='NumberMissingCount').head()
CSA2010 NumberMissingCount
0 Allendale/Irvington/S. Hilton 290
1 Beechfield/Ten Hills/West Hills 180
2 Belair-Edison 250
3 Brooklyn/Curtis Bay/Hawkins Point 580
4 Canton 0
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()gdf.value.unique()gdf.value.describe()count 5677.000000 mean 40.862603 std 175.769408 min 1.000000 25% 2.000000 50% 5.000000 75% 21.000000 max 6978.000000 Name: value, dtype: float64gdf.value.var() # Return unbiased variance over requested axis.30894.884924369213gdf.value.sem() # Return unbiased standard error of the mean over requested axis.2.332834040372481gdf.value.nunique() # Count distinct observations over requested axis.364# DataFrame.shape Return a tuple representing the dimensionality of the DataFrame. gdf.shape #DataFrame.size Return an int representing the number of elements in this object. gdf.size # DataFrame.ndim Return an integer representing the number of axes/array dimensions. gdf.ndim # Note Used : # DataFrame.axes Return a list representing the axes of the DataFrame. gdf.dtypes # Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). gdf.kurtosis()(13506, 6)810362id object name object color object radius float64 value float64 geometry geometry dtype: objectid -1.200000 radius 1132.851562 value 495.784213 dtype: float64

Load Tracts

# This will just beautify the output.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 20)

# The attributes are what we will use.
in_crs = 2248   # The CRS we receive our data in.
out_crs = 4326  # The CRS we would like to have our data represented as.
geom = 'geometry'  # The column where our spatial information lives.

# A URL to load the (water-clipped) 2010 Baltimore tract boundaries.
boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv"

# Read in the dataframe
gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010)
# Convert the geometry column datatype from a string of text into a coordinate datatype.
gdf['geometry'] = gdf['geometry'].apply(lambda x: loads(str(x)))
# Process the dataframe as a geodataframe with a known CRS and geom column.
gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')
# Bug fix: the original called .head() on the URL string
# (boundariesBaltimoreTractsNoWater2010.head()), which raises AttributeError;
# the intended preview is of the loaded geodataframe.
gdf.head()

Ensure merge is on consistent datatypes

# The merge keys must share a dtype, so cast both sides to string.
for frame, key in ((gdf, 'GEOID10'), (scooterdf, 'nameChange2')):
    frame[key] = frame[key].astype(str)

Perform the merge

# Merge tract boundaries with the scooter counts and drop the join keys.
scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)
scooterdfClean.head()
# scooterdf.to_csv('./scooterdf.csv', index=False)
# gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)
# ls

# Per-CSA summaries of the merged values (sums, means, missing counts).
scooterdfClean.groupby('CSA')['value'].sum()
scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')
scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')
scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')
scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()
scooterdfClean.groupby('CSA')['value'].mean()
scooterdfClean.groupby('CSA')['value'].sum()
scooterdfClean.CSA.value_counts()
# Bug fix: the following reference URL was a bare (invalid) statement in the
# original cell; it belongs in a comment.
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html

Inspect Routes Data

fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson'] columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent'] gdf = gpd.read_file( findFile('./', fileName) ) gdf.head()
id color streetname trip_count_sum trip_count_average trip_count_percent geometry
0 150197772 rgb(218, 231, 241) Jefferson Street 24 0.774194 0.0% LINESTRING (-76.58576 39.29660, -76.58550 39.29661)
1 150155955 rgb(164, 190, 219) 206 6.645161 0.3% LINESTRING (-76.61320 39.28136, -76.61318 39.28115)
2 150191673 rgb(204, 221, 236) Harford Avenue 49 1.580645 0.1% LINESTRING (-76.60169 39.30535, -76.60163 39.30544, -76.60156 39.30555)
3 150184657 rgb(229, 239, 246) 11 0.354839 0.0% LINESTRING (-76.61493 39.29294, -76.61491 39.29258, -76.61490 39.29230)
4 150188407 rgb(169, 194, 221) 178 5.741935 0.2% LINESTRING (-76.61662 39.29053, -76.61661 39.29046)
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")tst = """background = alt.Chart(gdf).mark_geoshape( stroke='white', strokeWidth=2 ).encode( color=alt.value('#eee'), ).properties( width=700, height=500 )""" # GeoDataFrame could be passed as usual pd.DataFrame city = alt.Chart(baltimore).mark_geoshape( ).project( ).encode( color='tpop10', # shorthand infer types as for regular pd.DataFrame tooltip='CSA2010' # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) city + routes

Clean the gdf of empties.

# Drop rows with missing or empty geometries before extracting endpoints.
# Bug fix: the original used `gdf[~gdf.isna()]` -- a frame-shaped boolean
# mask that NaN-fills individual cells instead of dropping the bad rows.
# Filtering on the geometry column removes the rows as intended.
gdf = gdf[gdf['geometry'].notna()]
gdf = gdf[~gdf.is_empty]

Now get the extremities; this will take a minute.

# hide_output gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))

Split the gdf into a left and right dataset

# Split the routes into two frames: the left endpoints keep the descriptive
# columns, the right endpoints keep only the id and coordinates.
left_drops = ['geometry', 'rightx', 'righty']
right_drops = ['geometry', 'leftx', 'lefty', 'streetname', 'trip_count_sum',
               'trip_count_average', 'trip_count_percent', 'color']
gdf_left = gdf.copy()
gdf_left = gdf_left.drop(columns=left_drops)
gdf_right = gdf.copy()
gdf_right = gdf_right.drop(columns=right_drops)

The coordinate variables of object type will cause problems.

gdf_right.dtypesid int64 righty float64 rightx object dtype: object

Let's go ahead and coerce a correction.

# Coerce the endpoint coordinate columns to numeric; the split() failure
# sentinels (False / type objects) become NaN via errors='coerce'.
for frame, cols in ((gdf_right, ('righty', 'rightx')),
                    (gdf_left, ('lefty', 'leftx'))):
    for col in cols:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

Now we can view the results

gdf_right.dtypesid int64 righty float64 rightx float64 dtype: object

Save these csv's because it took a minute to get to where we are now.

# Persist both endpoint tables so the slow split step need not be rerun.
for frame, path in ((gdf_right, 'rightRouts.csv'), (gdf_left, 'leftRouts.csv')):
    frame.to_csv(path)

Convert the datasets to geodataframes for further exploration!

# We could create a geodataframe like this #temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) ) #temp_gdf.head() # Alternately this could work.. unfinished.. but wkt.loads can make a Point from text # gdf_right['strCol']=gdf_right['rightx'].astype(str) # gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)# hide_output left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)# hide_output right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)right_csaMap.head()
id righty rightx geometry
0 150197772 -76.585503 39.296607 POINT (-76.58550 39.29661)
1 150155955 -76.613183 39.281147 POINT (-76.61318 39.28115)
2 150191673 -76.601555 39.305545 POINT (-76.60156 39.30555)
3 150184657 -76.614899 39.292301 POINT (-76.61490 39.29230)
4 150188407 -76.616608 39.290458 POINT (-76.61661 39.29046)
left_csaMap.head()left_csaMap.plot(column='trip_count_sum')right_csaMap.plot()baltimore.columns# hide_output right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry')right_csaMap.head()left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore, 'geometry', 'geometry')left_csaMap.head()left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
pointsinpolygon
0 18
1 0
2 67
3 0
4 1536
5 18
6 32
7 8
8 1
9 65
10 0
11 0
12 13
13 1764
14 0
15 760
16 0
17 85
18 1253
19 2
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_polyscooterdfClean = left_points_in_poly.merge(right_points_in_poly, left_on='CSA2010', right_on='CSA2010', how='left')scooterdfClean.columnsright_points_in_poly.columnsIndex(['OBJECTID', 'CSA2010', 'tpop10', 'male10', 'female10', 'paa17', 'pwhite17', 'pasi17', 'p2more17', 'ppac17', 'phisp17', 'racdiv17', 'age5_17', 'age18_17', 'age24_17', 'age64_17', 'age65_17', 'hhs10', 'femhhs17', 'fam17', 'hhsize10', 'mhhi17', 'hh25inc17', 'hh40inc17', 'hh60inc17', 'hh75inc17', 'hhm7517', 'hhpov17', 'hhchpov17', 'Shape__Area', 'Shape__Length', 'geometry', 'pointsinpolygon'], dtype='object')right_points_in_poly.head()right_points_in_poly.plot('pointsinpolygon', legend=True)
import altair as alt import geopandas as gpd import gpdvega # GeoDataFrame could be passed as usual pd.DataFrame chart = alt.Chart(right_points_in_poly).mark_geoshape( ).project( ).encode( color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) chart + routes chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'}) chart.save(fileName[:-8]+'.json')

Inspect Origins-Destinations Data

ls 'Trip origins-destinations by month'# @title Example form fields #@markdown Forms support many types of fields. fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson'] columnName = "name" #@param ['name', 'value', 'color', 'radius'] #@markdown --- gdf = gpd.read_file( findFile('./', fileName) ) gdf.plot( column = columnName) gdf.columns gdf[['id','name', 'value', 'color', 'radius']].head(5)dxp.bar(x='color', y='value', data=gdf, aggfunc='median')dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")